From 5dc0aae2f900064d1f58579929a2285ab289a436 Mon Sep 17 00:00:00 2001
From: Laurent Aimar <fenrir@videolan.org>
Date: Thu, 3 Jun 2004 19:29:33 +0000
Subject: [PATCH 1/1]  * all: re-import of the CVS.

git-svn-id: svn://svn.videolan.org/x264/trunk@1 df754926-b1dd-0310-bc7b-ec298dee348c
---
 .cvsignore                      |    3 +
 AUTHORS                         |   35 +
 COPYING                         |  340 ++++
 Jamfile                         |   67 +
 Makefile                        |   58 +
 Makefile.cygwin                 |   52 +
 TODO                            |   65 +
 build/cygwin/Makefile           |  102 ++
 build/win32/libx264.dsp         |  742 +++++++++
 build/win32/x264.dsp            |   94 ++
 build/win32/x264.dsw            |   44 +
 core/bs.h                       |  423 +++++
 core/cabac.c                    | 1044 ++++++++++++
 core/cabac.h                    |   78 +
 core/clip1.h                    |   71 +
 core/common.c                   |  300 ++++
 core/common.h                   |  344 ++++
 core/cpu.c                      |  233 +++
 core/cpu.h                      |   32 +
 core/csp.c                      |  379 +++++
 core/csp.h                      |   43 +
 core/dct.c                      |  288 ++++
 core/dct.h                      |   49 +
 core/frame.c                    |  701 ++++++++
 core/frame.h                    |   56 +
 core/i386/cpu.asm               |  111 ++
 core/i386/dct-c.c               |  294 ++++
 core/i386/dct.asm               |  313 ++++
 core/i386/dct.h                 |   38 +
 core/i386/mc-c.c                |  940 +++++++++++
 core/i386/mc.asm                |  187 +++
 core/i386/mc.h                  |   29 +
 core/i386/pixel.asm             |  705 ++++++++
 core/i386/pixel.h               |   43 +
 core/i386/predict.c             |  429 +++++
 core/i386/predict.h             |   31 +
 core/macroblock.c               | 1029 ++++++++++++
 core/macroblock.h               |  204 +++
 core/mc.c                       |  320 ++++
 core/mc.h                       |   45 +
 core/mdate.c                    |   48 +
 core/pixel.c                    |  228 +++
 core/pixel.h                    |   62 +
 core/ppc/mc.c                   |  681 ++++++++
 core/ppc/mc.h                   |   29 +
 core/ppc/pixel.c                |  215 +++
 core/ppc/pixel.h                |   29 +
 core/ppc/ppccommon.h            |  158 ++
 core/predict.c                  |  697 ++++++++
 core/predict.h                  |   92 ++
 core/set.h                      |  123 ++
 core/vlc.h                      |  914 +++++++++++
 decoder/decoder.c               |  772 +++++++++
 decoder/macroblock.c            | 1097 +++++++++++++
 decoder/macroblock.h            |   34 +
 decoder/set.c                   |  262 +++
 decoder/set.h                   |   33 +
 decoder/vlc.c                   |  236 +++
 decoder/vlc.h                   |   46 +
 doc/dct.txt                     |  111 ++
 encoder/analyse.c               | 1224 ++++++++++++++
 encoder/analyse.h               |   29 +
 encoder/cabac.c                 | 1195 ++++++++++++++
 encoder/cavlc.c                 |  688 ++++++++
 encoder/encoder.c               | 1235 ++++++++++++++
 encoder/macroblock.c            |  859 ++++++++++
 encoder/macroblock.h            |   38 +
 encoder/me.c                    |  194 +++
 encoder/me.h                    |   52 +
 encoder/ratecontrol.c           |   96 ++
 encoder/ratecontrol.h           |   52 +
 encoder/set.c                   |  382 +++++
 encoder/set.h                   |   32 +
 extras/getopt.c                 |  503 ++++++
 extras/getopt.h                 |  179 ++
 extras/stdint.h                 |  186 +++
 testing/checkasm.c              |  347 ++++
 testing/edge-detec.c            | 2733 +++++++++++++++++++++++++++++++
 testing/macroblock-dz.c         | 2266 +++++++++++++++++++++++++
 tools/.cvsignore                |    2 +
 tools/Jamfile                   |    9 +
 tools/avc2avi.c                 |  820 ++++++++++
 tools/x264-rd.sh                |   32 +
 tools/xyuv.c                    |  607 +++++++
 vfw/build/cygwin/Makefile       |  117 ++
 vfw/build/win32/bin/x264vfw.inf |   91 +
 vfw/build/win32/x264vfw.dsp     |  135 ++
 vfw/build/win32/x264vfw.dsw     |   29 +
 vfw/codec.c                     |  276 ++++
 vfw/config.c                    |  443 +++++
 vfw/driverproc.c                |  234 +++
 vfw/resource.h                  |   52 +
 vfw/x264vfw.h                   |  103 ++
 x264.c                          |  558 +++++++
 x264.h                          |  249 +++
 95 files changed, 32275 insertions(+)
 create mode 100644 .cvsignore
 create mode 100644 AUTHORS
 create mode 100644 COPYING
 create mode 100644 Jamfile
 create mode 100644 Makefile
 create mode 100644 Makefile.cygwin
 create mode 100644 TODO
 create mode 100644 build/cygwin/Makefile
 create mode 100644 build/win32/libx264.dsp
 create mode 100644 build/win32/x264.dsp
 create mode 100644 build/win32/x264.dsw
 create mode 100644 core/bs.h
 create mode 100644 core/cabac.c
 create mode 100644 core/cabac.h
 create mode 100644 core/clip1.h
 create mode 100644 core/common.c
 create mode 100644 core/common.h
 create mode 100644 core/cpu.c
 create mode 100644 core/cpu.h
 create mode 100644 core/csp.c
 create mode 100644 core/csp.h
 create mode 100644 core/dct.c
 create mode 100644 core/dct.h
 create mode 100644 core/frame.c
 create mode 100644 core/frame.h
 create mode 100644 core/i386/cpu.asm
 create mode 100644 core/i386/dct-c.c
 create mode 100644 core/i386/dct.asm
 create mode 100644 core/i386/dct.h
 create mode 100644 core/i386/mc-c.c
 create mode 100644 core/i386/mc.asm
 create mode 100644 core/i386/mc.h
 create mode 100644 core/i386/pixel.asm
 create mode 100644 core/i386/pixel.h
 create mode 100644 core/i386/predict.c
 create mode 100644 core/i386/predict.h
 create mode 100644 core/macroblock.c
 create mode 100644 core/macroblock.h
 create mode 100644 core/mc.c
 create mode 100644 core/mc.h
 create mode 100644 core/mdate.c
 create mode 100644 core/pixel.c
 create mode 100644 core/pixel.h
 create mode 100644 core/ppc/mc.c
 create mode 100644 core/ppc/mc.h
 create mode 100644 core/ppc/pixel.c
 create mode 100644 core/ppc/pixel.h
 create mode 100644 core/ppc/ppccommon.h
 create mode 100644 core/predict.c
 create mode 100644 core/predict.h
 create mode 100644 core/set.h
 create mode 100644 core/vlc.h
 create mode 100644 decoder/decoder.c
 create mode 100644 decoder/macroblock.c
 create mode 100644 decoder/macroblock.h
 create mode 100644 decoder/set.c
 create mode 100644 decoder/set.h
 create mode 100644 decoder/vlc.c
 create mode 100644 decoder/vlc.h
 create mode 100644 doc/dct.txt
 create mode 100644 encoder/analyse.c
 create mode 100644 encoder/analyse.h
 create mode 100644 encoder/cabac.c
 create mode 100644 encoder/cavlc.c
 create mode 100644 encoder/encoder.c
 create mode 100644 encoder/macroblock.c
 create mode 100644 encoder/macroblock.h
 create mode 100644 encoder/me.c
 create mode 100644 encoder/me.h
 create mode 100644 encoder/ratecontrol.c
 create mode 100644 encoder/ratecontrol.h
 create mode 100644 encoder/set.c
 create mode 100644 encoder/set.h
 create mode 100644 extras/getopt.c
 create mode 100644 extras/getopt.h
 create mode 100644 extras/stdint.h
 create mode 100644 testing/checkasm.c
 create mode 100644 testing/edge-detec.c
 create mode 100644 testing/macroblock-dz.c
 create mode 100644 tools/.cvsignore
 create mode 100644 tools/Jamfile
 create mode 100644 tools/avc2avi.c
 create mode 100755 tools/x264-rd.sh
 create mode 100644 tools/xyuv.c
 create mode 100644 vfw/build/cygwin/Makefile
 create mode 100644 vfw/build/win32/bin/x264vfw.inf
 create mode 100644 vfw/build/win32/x264vfw.dsp
 create mode 100644 vfw/build/win32/x264vfw.dsw
 create mode 100644 vfw/codec.c
 create mode 100644 vfw/config.c
 create mode 100644 vfw/driverproc.c
 create mode 100644 vfw/resource.h
 create mode 100644 vfw/x264vfw.h
 create mode 100644 x264.c
 create mode 100644 x264.h

diff --git a/.cvsignore b/.cvsignore
new file mode 100644
index 00000000..82e863c8
--- /dev/null
+++ b/.cvsignore
@@ -0,0 +1,3 @@
+.depend
+x264
+checkasm
diff --git a/AUTHORS b/AUTHORS
new file mode 100644
index 00000000..af06be2a
--- /dev/null
+++ b/AUTHORS
@@ -0,0 +1,35 @@
+# $Id: AUTHORS,v 1.1 2004/06/03 19:27:06 fenrir Exp $
+# 
+# The format of this file was inspired by the Linux kernel CREDITS file.
+# Authors are listed alphabetically.
+# 
+# The fields are: name (N), email (E), web-address (W), CVS account login (C),
+# PGP key ID and fingerprint (P), description (D), and snail-mail address (S).
+
+N: Laurent Aimar
+E: fenrir AT via.ecp DOT fr
+C: fenrir
+D: Initial import, maintainer
+D: i386 asm (mmx/mmx2)
+S: France
+
+N: Eric Petit
+E: titer AT videolan DOT org
+C: titer
+D: Altivec
+D: BeOS and MacOS X ports.
+S: France
+
+N: Min Chen
+E: chenm001 AT 163 DOT com
+C: chenm001
+D: Win32/VC 6.0 port
+D: gcc asm to nasm conversion
+D: vfw interface
+S: China
+
+N: Justin Clay
+E: justin_clay AT hotmail DOT com
+C: wheatgerm
+D: Initial work on vfw
+S: Nova Scotia, Canada
diff --git a/COPYING b/COPYING
new file mode 100644
index 00000000..d60c31a9
--- /dev/null
+++ b/COPYING
@@ -0,0 +1,340 @@
+		    GNU GENERAL PUBLIC LICENSE
+		       Version 2, June 1991
+
+ Copyright (C) 1989, 1991 Free Software Foundation, Inc.
+     59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+			    Preamble
+
+  The licenses for most software are designed to take away your
+freedom to share and change it.  By contrast, the GNU General Public
+License is intended to guarantee your freedom to share and change free
+software--to make sure the software is free for all its users.  This
+General Public License applies to most of the Free Software
+Foundation's software and to any other program whose authors commit to
+using it.  (Some other Free Software Foundation software is covered by
+the GNU Library General Public License instead.)  You can apply it to
+your programs, too.
+
+  When we speak of free software, we are referring to freedom, not
+price.  Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+this service if you wish), that you receive source code or can get it
+if you want it, that you can change the software or use pieces of it
+in new free programs; and that you know you can do these things.
+
+  To protect your rights, we need to make restrictions that forbid
+anyone to deny you these rights or to ask you to surrender the rights.
+These restrictions translate to certain responsibilities for you if you
+distribute copies of the software, or if you modify it.
+
+  For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must give the recipients all the rights that
+you have.  You must make sure that they, too, receive or can get the
+source code.  And you must show them these terms so they know their
+rights.
+
+  We protect your rights with two steps: (1) copyright the software, and
+(2) offer you this license which gives you legal permission to copy,
+distribute and/or modify the software.
+
+  Also, for each author's protection and ours, we want to make certain
+that everyone understands that there is no warranty for this free
+software.  If the software is modified by someone else and passed on, we
+want its recipients to know that what they have is not the original, so
+that any problems introduced by others will not reflect on the original
+authors' reputations.
+
+  Finally, any free program is threatened constantly by software
+patents.  We wish to avoid the danger that redistributors of a free
+program will individually obtain patent licenses, in effect making the
+program proprietary.  To prevent this, we have made it clear that any
+patent must be licensed for everyone's free use or not licensed at all.
+
+  The precise terms and conditions for copying, distribution and
+modification follow.
+
+		    GNU GENERAL PUBLIC LICENSE
+   TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+  0. This License applies to any program or other work which contains
+a notice placed by the copyright holder saying it may be distributed
+under the terms of this General Public License.  The "Program", below,
+refers to any such program or work, and a "work based on the Program"
+means either the Program or any derivative work under copyright law:
+that is to say, a work containing the Program or a portion of it,
+either verbatim or with modifications and/or translated into another
+language.  (Hereinafter, translation is included without limitation in
+the term "modification".)  Each licensee is addressed as "you".
+
+Activities other than copying, distribution and modification are not
+covered by this License; they are outside its scope.  The act of
+running the Program is not restricted, and the output from the Program
+is covered only if its contents constitute a work based on the
+Program (independent of having been made by running the Program).
+Whether that is true depends on what the Program does.
+
+  1. You may copy and distribute verbatim copies of the Program's
+source code as you receive it, in any medium, provided that you
+conspicuously and appropriately publish on each copy an appropriate
+copyright notice and disclaimer of warranty; keep intact all the
+notices that refer to this License and to the absence of any warranty;
+and give any other recipients of the Program a copy of this License
+along with the Program.
+
+You may charge a fee for the physical act of transferring a copy, and
+you may at your option offer warranty protection in exchange for a fee.
+
+  2. You may modify your copy or copies of the Program or any portion
+of it, thus forming a work based on the Program, and copy and
+distribute such modifications or work under the terms of Section 1
+above, provided that you also meet all of these conditions:
+
+    a) You must cause the modified files to carry prominent notices
+    stating that you changed the files and the date of any change.
+
+    b) You must cause any work that you distribute or publish, that in
+    whole or in part contains or is derived from the Program or any
+    part thereof, to be licensed as a whole at no charge to all third
+    parties under the terms of this License.
+
+    c) If the modified program normally reads commands interactively
+    when run, you must cause it, when started running for such
+    interactive use in the most ordinary way, to print or display an
+    announcement including an appropriate copyright notice and a
+    notice that there is no warranty (or else, saying that you provide
+    a warranty) and that users may redistribute the program under
+    these conditions, and telling the user how to view a copy of this
+    License.  (Exception: if the Program itself is interactive but
+    does not normally print such an announcement, your work based on
+    the Program is not required to print an announcement.)
+
+These requirements apply to the modified work as a whole.  If
+identifiable sections of that work are not derived from the Program,
+and can be reasonably considered independent and separate works in
+themselves, then this License, and its terms, do not apply to those
+sections when you distribute them as separate works.  But when you
+distribute the same sections as part of a whole which is a work based
+on the Program, the distribution of the whole must be on the terms of
+this License, whose permissions for other licensees extend to the
+entire whole, and thus to each and every part regardless of who wrote it.
+
+Thus, it is not the intent of this section to claim rights or contest
+your rights to work written entirely by you; rather, the intent is to
+exercise the right to control the distribution of derivative or
+collective works based on the Program.
+
+In addition, mere aggregation of another work not based on the Program
+with the Program (or with a work based on the Program) on a volume of
+a storage or distribution medium does not bring the other work under
+the scope of this License.
+
+  3. You may copy and distribute the Program (or a work based on it,
+under Section 2) in object code or executable form under the terms of
+Sections 1 and 2 above provided that you also do one of the following:
+
+    a) Accompany it with the complete corresponding machine-readable
+    source code, which must be distributed under the terms of Sections
+    1 and 2 above on a medium customarily used for software interchange; or,
+
+    b) Accompany it with a written offer, valid for at least three
+    years, to give any third party, for a charge no more than your
+    cost of physically performing source distribution, a complete
+    machine-readable copy of the corresponding source code, to be
+    distributed under the terms of Sections 1 and 2 above on a medium
+    customarily used for software interchange; or,
+
+    c) Accompany it with the information you received as to the offer
+    to distribute corresponding source code.  (This alternative is
+    allowed only for noncommercial distribution and only if you
+    received the program in object code or executable form with such
+    an offer, in accord with Subsection b above.)
+
+The source code for a work means the preferred form of the work for
+making modifications to it.  For an executable work, complete source
+code means all the source code for all modules it contains, plus any
+associated interface definition files, plus the scripts used to
+control compilation and installation of the executable.  However, as a
+special exception, the source code distributed need not include
+anything that is normally distributed (in either source or binary
+form) with the major components (compiler, kernel, and so on) of the
+operating system on which the executable runs, unless that component
+itself accompanies the executable.
+
+If distribution of executable or object code is made by offering
+access to copy from a designated place, then offering equivalent
+access to copy the source code from the same place counts as
+distribution of the source code, even though third parties are not
+compelled to copy the source along with the object code.
+
+  4. You may not copy, modify, sublicense, or distribute the Program
+except as expressly provided under this License.  Any attempt
+otherwise to copy, modify, sublicense or distribute the Program is
+void, and will automatically terminate your rights under this License.
+However, parties who have received copies, or rights, from you under
+this License will not have their licenses terminated so long as such
+parties remain in full compliance.
+
+  5. You are not required to accept this License, since you have not
+signed it.  However, nothing else grants you permission to modify or
+distribute the Program or its derivative works.  These actions are
+prohibited by law if you do not accept this License.  Therefore, by
+modifying or distributing the Program (or any work based on the
+Program), you indicate your acceptance of this License to do so, and
+all its terms and conditions for copying, distributing or modifying
+the Program or works based on it.
+
+  6. Each time you redistribute the Program (or any work based on the
+Program), the recipient automatically receives a license from the
+original licensor to copy, distribute or modify the Program subject to
+these terms and conditions.  You may not impose any further
+restrictions on the recipients' exercise of the rights granted herein.
+You are not responsible for enforcing compliance by third parties to
+this License.
+
+  7. If, as a consequence of a court judgment or allegation of patent
+infringement or for any other reason (not limited to patent issues),
+conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License.  If you cannot
+distribute so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you
+may not distribute the Program at all.  For example, if a patent
+license would not permit royalty-free redistribution of the Program by
+all those who receive copies directly or indirectly through you, then
+the only way you could satisfy both it and this License would be to
+refrain entirely from distribution of the Program.
+
+If any portion of this section is held invalid or unenforceable under
+any particular circumstance, the balance of the section is intended to
+apply and the section as a whole is intended to apply in other
+circumstances.
+
+It is not the purpose of this section to induce you to infringe any
+patents or other property right claims or to contest validity of any
+such claims; this section has the sole purpose of protecting the
+integrity of the free software distribution system, which is
+implemented by public license practices.  Many people have made
+generous contributions to the wide range of software distributed
+through that system in reliance on consistent application of that
+system; it is up to the author/donor to decide if he or she is willing
+to distribute software through any other system and a licensee cannot
+impose that choice.
+
+This section is intended to make thoroughly clear what is believed to
+be a consequence of the rest of this License.
+
+  8. If the distribution and/or use of the Program is restricted in
+certain countries either by patents or by copyrighted interfaces, the
+original copyright holder who places the Program under this License
+may add an explicit geographical distribution limitation excluding
+those countries, so that distribution is permitted only in or among
+countries not thus excluded.  In such case, this License incorporates
+the limitation as if written in the body of this License.
+
+  9. The Free Software Foundation may publish revised and/or new versions
+of the General Public License from time to time.  Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+Each version is given a distinguishing version number.  If the Program
+specifies a version number of this License which applies to it and "any
+later version", you have the option of following the terms and conditions
+either of that version or of any later version published by the Free
+Software Foundation.  If the Program does not specify a version number of
+this License, you may choose any version ever published by the Free Software
+Foundation.
+
+  10. If you wish to incorporate parts of the Program into other free
+programs whose distribution conditions are different, write to the author
+to ask for permission.  For software which is copyrighted by the Free
+Software Foundation, write to the Free Software Foundation; we sometimes
+make exceptions for this.  Our decision will be guided by the two goals
+of preserving the free status of all derivatives of our free software and
+of promoting the sharing and reuse of software generally.
+
+			    NO WARRANTY
+
+  11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
+FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW.  EXCEPT WHEN
+OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
+PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
+OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.  THE ENTIRE RISK AS
+TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU.  SHOULD THE
+PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
+REPAIR OR CORRECTION.
+
+  12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
+REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
+INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
+OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
+TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
+YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
+PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGES.
+
+		     END OF TERMS AND CONDITIONS
+
+	    How to Apply These Terms to Your New Programs
+
+  If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+  To do so, attach the following notices to the program.  It is safest
+to attach them to the start of each source file to most effectively
+convey the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+    <one line to give the program's name and a brief idea of what it does.>
+    Copyright (C) <year>  <name of author>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program; if not, write to the Free Software
+    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+
+
+Also add information on how to contact you by electronic and paper mail.
+
+If the program is interactive, make it output a short notice like this
+when it starts in an interactive mode:
+
+    Gnomovision version 69, Copyright (C) year  name of author
+    Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+    This is free software, and you are welcome to redistribute it
+    under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License.  Of course, the commands you use may
+be called something other than `show w' and `show c'; they could even be
+mouse-clicks or menu items--whatever suits your program.
+
+You should also get your employer (if you work as a programmer) or your
+school, if any, to sign a "copyright disclaimer" for the program, if
+necessary.  Here is a sample; alter the names:
+
+  Yoyodyne, Inc., hereby disclaims all copyright interest in the program
+  `Gnomovision' (which makes passes at compilers) written by James Hacker.
+
+  <signature of Ty Coon>, 1 April 1989
+  Ty Coon, President of Vice
+
+This General Public License does not permit incorporating your program into
+proprietary programs.  If your program is a subroutine library, you may
+consider it more useful to permit linking proprietary applications with the
+library.  If this is what you want to do, use the GNU Library General
+Public License instead of this License.
diff --git a/Jamfile b/Jamfile
new file mode 100644
index 00000000..3266e836
--- /dev/null
+++ b/Jamfile
@@ -0,0 +1,67 @@
+# $Id: Jamfile,v 1.1 2004/06/03 19:27:06 fenrir Exp $
+#
+
+# Compilers
+CC = gcc ;
+AS = nasm ;
+
+# Flags
+# To profile: -fprofile-arcs
+# Once done : -fbranch-probabilities
+CCFLAGS = -g -Wall -W ;
+
+# Global defines
+DEFINES = DEBUG __X264__ ;
+
+# Optimization flags
+OPTIM = -O3 -funroll-loops ;
+
+# Header search directories
+HDRS = . core core/i386 decoder encoder ;
+
+SOURCES_C = core/mc.c core/predict.c core/pixel.c core/macroblock.c
+            core/frame.c core/dct.c core/cpu.c core/cabac.c
+            core/common.c core/mdate.c core/csp.c
+            encoder/analyse.c encoder/me.c encoder/ratecontrol.c
+            encoder/set.c encoder/macroblock.c encoder/cabac.c encoder/cavlc.c
+            encoder/encoder.c ;
+
+SOURCES_X86 = core/i386/cpu.asm ;
+SOURCES_MMX = core/i386/mc-c.c core/i386/dct-c.c core/i386/predict.c core/i386/dct.asm core/i386/pixel.asm core/i386/mc.asm ;
+
+SOURCES_ALTIVEC = core/ppc/mc.c core/ppc/pixel.c ;
+
+# libx264: portable C sources plus the per-OS optimized sources selected below
+SOURCES_X264 = $(SOURCES_C) ;
+if $(OS) = LINUX
+{
+    DEFINES      += ARCH_X86 HAVE_MMXEXT HAVE_MALLOC_H ;
+    SOURCES_X264 += $(SOURCES_MMX) ;
+    SOURCES_X264 += $(SOURCES_X86) ;
+    ASFLAGS = -f elf ;
+
+    # Don't ask why
+    NOARUPDATE = false ;
+}
+if $(OS) = MACOSX
+{
+    DEFINES      += HAVE_ALTIVEC ;
+    SOURCES_X264 += $(SOURCES_ALTIVEC) ;
+    CCFLAGS      += -faltivec ;
+#    OPTIM        += -falign-loops=16 ;
+}
+Library libx264 : $(SOURCES_X264) ;
+
+# x264 command-line encoder
+LINKLIBS += -lm ;
+LinkLibraries x264 : libx264.a ;
+Main x264 : x264.c ;
+
+# checkasm
+LinkLibraries checkasm : libx264.a ;
+Main checkasm : testing/checkasm.c ;
+
+# XXX Override RmTemps with an empty action so that *.o files are not removed
+actions quietly updated piecemeal together RmTemps
+{
+}
diff --git a/Makefile b/Makefile
new file mode 100644
index 00000000..25060f5d
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,58 @@
+# Makefile: tuned for i386/MMX systems only.
+# For ppc, append:
+#  SRCS: core/ppc/mc.c core/ppc/pixel.c
+#  Defines: HAVE_ALTIVEC
+#  CFLAGS: -faltivec
+#
+CC=gcc
+CFLAGS=-g -Wall -I. -DDEBUG -O4 -funroll-loops -D__X264__ -DHAVE_MALLOC_H -DHAVE_MMXEXT -DARCH_X86
+
+SRCS=  core/mc.c core/predict.c core/pixel.c core/macroblock.c \
+       core/frame.c core/dct.c core/cpu.c core/cabac.c \
+       core/common.c core/mdate.c core/csp.c \
+       encoder/analyse.c encoder/me.c encoder/ratecontrol.c \
+       encoder/set.c encoder/macroblock.c encoder/cabac.c encoder/cavlc.c \
+       encoder/encoder.c \
+       core/i386/mc-c.c core/i386/dct-c.c core/i386/predict.c \
+       x264.c
+
+AS= nasm
+# for linux
+ASFLAGS=-f elf
+# for cygwin
+#ASFLAGS=-f gnuwin32 -DPREFIX
+
+ASMSRC= core/i386/dct.asm core/i386/cpu.asm core/i386/pixel.asm core/i386/mc.asm
+OBJASM= $(ASMSRC:%.asm=%.o)
+
+OBJS = $(SRCS:%.c=%.o)
+DEP  = depend
+
+default: $(DEP) x264
+
+libx264.a: $(OBJS) $(OBJASM)
+	ar rc libx264.a $(OBJS) $(OBJASM)
+
+x264: libx264.a x264.o
+	$(CC) $(CFLAGS) -o x264 x264.o libx264.a -lm
+
+checkasm: testing/checkasm.c libx264.a
+	$(CC) $(CFLAGS) -o checkasm $< libx264.a -lm
+
+%.o: %.asm
+	$(AS) $(ASFLAGS) -o $@ $<
+# -MT keeps the directory in each object name; plain -MM would emit "mc.o:" for core/mc.c.
+.depend: $(SRCS)
+	( $(foreach SRC,$(SRCS),$(CC) -MM $(CFLAGS) -MT $(SRC:%.c=%.o) $(SRC) &&) true ) 1> .depend
+
+depend: .depend
+ifneq ($(wildcard .depend),)
+include .depend
+endif
+
+clean:
+	rm -f $(OBJS) $(OBJASM) *.a x264.o .depend x264 checkasm
+
+distclean:
+	rm -f $(OBJS) $(OBJASM) *.a x264.o .depend x264 checkasm
+.PHONY: default depend clean distclean
diff --git a/Makefile.cygwin b/Makefile.cygwin
new file mode 100644
index 00000000..b99f8dc4
--- /dev/null
+++ b/Makefile.cygwin
@@ -0,0 +1,52 @@
+# Makefile: tuned for i386/MMX cygwin system only
+#
+CC=gcc
+CFLAGS=-g -Wall -I. -mno-cygwin -DDEBUG -O4 -funroll-loops -D__X264__ -UHAVE_MALLOC_H -DHAVE_MMXEXT -DARCH_X86
+
+SRCS=  core/mc.c core/predict.c core/pixel.c core/macroblock.c \
+       core/frame.c core/dct.c core/cpu.c core/cabac.c \
+       core/common.c core/mdate.c core/csp.c \
+       encoder/analyse.c encoder/me.c encoder/ratecontrol.c \
+       encoder/set.c encoder/macroblock.c encoder/cabac.c encoder/cavlc.c \
+       encoder/encoder.c \
+       core/i386/mc-c.c core/i386/dct-c.c core/i386/predict.c \
+       x264.c
+
+AS= nasm
+#for cygwin
+ASFLAGS=-f win32 -DPREFIX
+
+ASMSRC= core/i386/dct.asm core/i386/cpu.asm core/i386/pixel.asm core/i386/mc.asm
+OBJASM= $(ASMSRC:%.asm=%.o)
+
+OBJS = $(SRCS:%.c=%.o)
+DEP  = depend
+
+default: $(DEP) x264
+
+libx264.a: $(OBJS) $(OBJASM)
+	ar rc libx264.a $(OBJS) $(OBJASM)
+
+x264: libx264.a x264.o
+	$(CC) $(CFLAGS) -o x264 x264.o libx264.a -lm
+
+checkasm: testing/checkasm.c libx264.a
+	$(CC) $(CFLAGS) -o checkasm $< libx264.a -lm
+
+%.o: %.asm
+	$(AS) $(ASFLAGS) -o $@ $<
+
+.depend: $(SRCS) x264.c
+	$(CC) -MM $(CFLAGS) $(SRCS) x264.c 1> .depend
+
+depend: .depend
+ifneq ($(wildcard .depend),)
+include .depend
+endif
+
+clean:
+	rm -f $(OBJS) $(OBJASM) *.a x264.o .depend x264
+
+distclean:
+	rm -f $(OBJS) $(OBJASM) *.a x264.o .depend x264
+
diff --git a/TODO b/TODO
new file mode 100644
index 00000000..ec1ab53f
--- /dev/null
+++ b/TODO
@@ -0,0 +1,65 @@
+ It is far from complete, anyway :
+
+General:
+--------
+ Encoder:
+ ########
+
+ * CABAC: check if adaptive model is really working. (I didn't see any improvements)
+
+ * Field support : no and I probably won't do it.
+
+ * Slice A/B/C (partitioning): is there any interest in doing it ? (Shouldn't be hard).
+    - extend x264_t
+    - review x264_macroblock_write_cavlc
+
+ * Intra encoding:
+    - in I_4x4 mode, some prediction modes aren't tested or supported:
+    when some surrounding pixels are unavailable but could be predicted
+    from others. (see the standard)
+
+ * Inter coding:
+    - D_4x8 D_8x4 and D_4x4 ME P block -> done but too slow.
+    - B_ types.
+    - scene change detection.
+    - long term ?
+    - ...
+
+ * B frame: B_L0/L1/BI work in cavlc (need more testing).
+    -> need to do all others mb type (first B_DIRECT and B_SKIP)
+    -> cabac
+    -> look at weighted prediction (should give better results)
+    -> better analyse algo (as always ;)
+
+ * Speed issue (oprofile is your friend)
+    - mc.c:30% and pixel.c:20% (mc is used by ME)
+    - Motion Estimation -> try better/faster algos.
+    - loop filter
+    - stream writing (bs)
+    - ...
+
+ * Time spent: (test file: 720x576, mmx, mmxext)
+    CAVLC: analyse=73% encode=15% write=4% filter=6%
+    CABAC: analyse=69% encode=16% write=8% filter=5%
+
+ * Limitations:
+    - frame width/height %16 == 0 only.
+
+ * ...
+
+ Decoder:
+ ########
+
+ * Currently decoder/* won't even compile, and anyway is unusable.
+
+ Build:
+ ######
+ * Port gcc inline asm to nasm files (BUT without any speed loss, otherwise
+   it will be rejected).
+
+Coding issue:
+-------------
+ * tables : some are duplicated -> find a solution (easy).
+ * documentation ? (mouaaaarfff ;)
+ * ...
+
diff --git a/build/cygwin/Makefile b/build/cygwin/Makefile
new file mode 100644
index 00000000..d4458cc5
--- /dev/null
+++ b/build/cygwin/Makefile
@@ -0,0 +1,102 @@
+##############################################################################
+#
+# Makefile for libx264.a and x264
+#
+# Author: x264 by Laurent Aimar <fenrir@via.ecp.fr>
+#
+# $Id: Makefile,v 1.1 2004/06/03 19:27:06 fenrir Exp $
+##############################################################################
+
+# Current dir
+DIR_CUR=$(shell pwd)
+
+# Path to src
+DIR_SRC=$(DIR_CUR)/../..
+
+# Sources
+SRC_C= core/mc.c core/predict.c core/pixel.c core/macroblock.c \
+       core/frame.c core/dct.c core/cpu.c core/cabac.c \
+       core/common.c core/mdate.c core/csp.c \
+       encoder/analyse.c encoder/me.c encoder/ratecontrol.c \
+       encoder/set.c encoder/macroblock.c encoder/cabac.c encoder/cavlc.c \
+       encoder/encoder.c \
+       core/i386/mc-c.c core/i386/dct-c.c core/i386/predict.c
+
+SRC_ASM= core/i386/dct.asm core/i386/cpu.asm core/i386/pixel.asm core/i386/mc.asm
+
+# Alias
+RM= rm -rf
+
+##############################################################################
+# CFLAGS
+##############################################################################
+
+# Constants which should not be modified
+# The `mingw-runtime` package is required when building with -mno-cygwin
+CFLAGS += -I$(DIR_SRC)
+CFLAGS += -mno-cygwin
+CFLAGS += -D__X264__ -DARCH_X86 -DHAVE_MMXEXT -D_CYGWIN
+
+# Optional Compiler options
+CFLAGS += -g -Wall -DDEBUG
+CFLAGS += -O3
+CFLAGS += -finline-functions
+CFLAGS += -funroll-loops
+CFLAGS += -ffast-math
+
+
+##############################################################################
+# Compiler flags for linking stage
+##############################################################################
+
+LDFLAGS += -L$(DIR_LIB) -lx264
+
+##############################################################################
+# ASM
+##############################################################################
+AS= nasm
+ASFLAGS= -f gnuwin32 -DPREFIX
+##############################################################################
+# Rules
+##############################################################################
+
+OBJECTS = $(SRC_C:.c=.obj)
+OBJECTS+= $(SRC_ASM:.asm=.obj)
+
+.SUFFIXES: .obj .asm .c
+
+DIR_BUILD= $(DIR_CUR)/bin
+VPATH = $(DIR_SRC):$(DIR_BUILD)
+
+all: libx264.a x264.exe
+
+$(DIR_BUILD):
+	@echo " D: $(DIR_BUILD)"
+	@mkdir -p $(DIR_BUILD)
+
+.asm.obj:
+	@echo " A: $(@D)/$(<F)"
+	@mkdir -p $(DIR_BUILD)/$(@D)
+	@$(AS) $(ASFLAGS) -o $(DIR_BUILD)/$@ $<
+
+.c.obj:
+	@echo " C: $(@D)/$(<F)"
+	@mkdir -p $(DIR_BUILD)/$(@D)
+	@$(CC) $(CFLAGS) -c -o $(DIR_BUILD)/$@ $<
+
+libx264.a: $(DIR_BUILD) $(OBJECTS)
+	@echo " L: $(@F)"
+	@cd $(DIR_BUILD) && \
+	ar rc libx264.a $(OBJECTS) && \
+	cp -f libx264.a $(DIR_SRC)
+
+x264.exe: $(DIR_BUILD) $(OBJECTS) x264.obj
+	@echo " L: $(@F)"
+	@cd $(DIR_BUILD) && \
+	$(CC) $(CFLAGS) -o x264 x264.obj libx264.a -lm && \
+	cp -f x264.exe $(DIR_SRC)
+
+clean:
+	@echo " Cl: Object files and target lib"
+	@$(RM) $(DIR_BUILD)
+
diff --git a/build/win32/libx264.dsp b/build/win32/libx264.dsp
new file mode 100644
index 00000000..63a4d6a0
--- /dev/null
+++ b/build/win32/libx264.dsp
@@ -0,0 +1,742 @@
+# Microsoft Developer Studio Project File - Name="libx264" - Package Owner=<4>
+# Microsoft Developer Studio Generated Build File, Format Version 6.00
+# ** DO NOT EDIT **
+
+# TARGTYPE "Win32 (x86) Static Library" 0x0104
+
+CFG=libx264 - Win32 Debug
+!MESSAGE This is not a valid makefile. To build this project using NMAKE,
+!MESSAGE use the Export Makefile command and run
+!MESSAGE 
+!MESSAGE NMAKE /f "libx264.mak".
+!MESSAGE 
+!MESSAGE You can specify a configuration when running NMAKE
+!MESSAGE by defining the macro CFG on the command line. For example:
+!MESSAGE 
+!MESSAGE NMAKE /f "libx264.mak" CFG="libx264 - Win32 Debug"
+!MESSAGE 
+!MESSAGE Possible choices for configuration are:
+!MESSAGE 
+!MESSAGE "libx264 - Win32 Release" (based on "Win32 (x86) Static Library")
+!MESSAGE "libx264 - Win32 Debug" (based on "Win32 (x86) Static Library")
+!MESSAGE 
+
+# Begin Project
+# PROP AllowPerConfigDependencies 0
+# PROP Scc_ProjName ""
+# PROP Scc_LocalPath ""
+CPP=cl.exe
+RSC=rc.exe
+
+!IF  "$(CFG)" == "libx264 - Win32 Release"
+
+# PROP BASE Use_MFC 0
+# PROP BASE Use_Debug_Libraries 0
+# PROP BASE Output_Dir "Release"
+# PROP BASE Intermediate_Dir "Release"
+# PROP BASE Target_Dir ""
+# PROP Use_MFC 0
+# PROP Use_Debug_Libraries 0
+# PROP Output_Dir "Release"
+# PROP Intermediate_Dir "Release"
+# PROP Target_Dir ""
+# ADD BASE CPP /nologo /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_MBCS" /D "_LIB" /YX /FD /c
+# ADD CPP /nologo /W3 /GX /O2 /I "./core" /I "./encode" /I "./decode" /I "../../extras" /I "../.." /D "NDEBUG" /D "_LIB" /D "WIN32" /D "_MBCS" /D "__X264__" /D "HAVE_MMXEXT" /D "ARCH_X86" /FD /c
+# SUBTRACT CPP /YX
+# ADD BASE RSC /l 0x804 /d "NDEBUG"
+# ADD RSC /l 0x804 /d "NDEBUG"
+BSC32=bscmake.exe
+# ADD BASE BSC32 /nologo
+# ADD BSC32 /nologo
+LIB32=link.exe -lib
+# ADD BASE LIB32 /nologo
+# ADD LIB32 /nologo /out:"bin/libx264.lib"
+
+!ELSEIF  "$(CFG)" == "libx264 - Win32 Debug"
+
+# PROP BASE Use_MFC 0
+# PROP BASE Use_Debug_Libraries 1
+# PROP BASE Output_Dir "Debug"
+# PROP BASE Intermediate_Dir "Debug"
+# PROP BASE Target_Dir ""
+# PROP Use_MFC 0
+# PROP Use_Debug_Libraries 1
+# PROP Output_Dir "Debug"
+# PROP Intermediate_Dir "Debug"
+# PROP Target_Dir ""
+# ADD BASE CPP /nologo /W3 /Gm /GX /ZI /Od /D "WIN32" /D "_DEBUG" /D "_MBCS" /D "_LIB" /YX /FD /GZ /c
+# ADD CPP /nologo /W3 /Gm /GX /Zi /Od /I "./core" /I "./encode" /I "./decode" /I "../../extras" /I "../.." /D "_DEBUG" /D "_LIB" /D "WIN32" /D "_MBCS" /D "__X264__" /D "HAVE_MMXEXT" /D "ARCH_X86" /FD /GZ /c
+# SUBTRACT CPP /YX
+# ADD BASE RSC /l 0x804 /d "_DEBUG"
+# ADD RSC /l 0x804 /d "_DEBUG"
+BSC32=bscmake.exe
+# ADD BASE BSC32 /nologo
+# ADD BSC32 /nologo
+LIB32=link.exe -lib
+# ADD BASE LIB32 /nologo
+# ADD LIB32 /nologo /out:"bin/libx264.lib"
+
+!ENDIF 
+
+# Begin Target
+
+# Name "libx264 - Win32 Release"
+# Name "libx264 - Win32 Debug"
+# Begin Group "Enc"
+
+# PROP Default_Filter ".c"
+# Begin Group "enc_h"
+
+# PROP Default_Filter ".h"
+# Begin Source File
+
+SOURCE=..\..\encoder\analyse.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\encoder\macroblock.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\encoder\me.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\encoder\ratecontrol.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\encoder\set.h
+# End Source File
+# End Group
+# Begin Source File
+
+SOURCE=..\..\encoder\analyse.c
+
+!IF  "$(CFG)" == "libx264 - Win32 Release"
+
+# PROP Intermediate_Dir "obj/enc_release"
+
+!ELSEIF  "$(CFG)" == "libx264 - Win32 Debug"
+
+# PROP Intermediate_Dir "obj/enc_debug"
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\encoder\cabac.c
+
+!IF  "$(CFG)" == "libx264 - Win32 Release"
+
+# PROP Intermediate_Dir "obj/enc_release"
+
+!ELSEIF  "$(CFG)" == "libx264 - Win32 Debug"
+
+# PROP Intermediate_Dir "obj/enc_debug"
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\encoder\cavlc.c
+
+!IF  "$(CFG)" == "libx264 - Win32 Release"
+
+# PROP Intermediate_Dir "obj/enc_release"
+
+!ELSEIF  "$(CFG)" == "libx264 - Win32 Debug"
+
+# PROP Intermediate_Dir "obj/enc_debug"
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\encoder\encoder.c
+
+!IF  "$(CFG)" == "libx264 - Win32 Release"
+
+# PROP Intermediate_Dir "obj/enc_release"
+
+!ELSEIF  "$(CFG)" == "libx264 - Win32 Debug"
+
+# PROP Intermediate_Dir "obj/enc_debug"
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\encoder\macroblock.c
+
+!IF  "$(CFG)" == "libx264 - Win32 Release"
+
+# PROP Intermediate_Dir "obj/enc_release"
+
+!ELSEIF  "$(CFG)" == "libx264 - Win32 Debug"
+
+# PROP Intermediate_Dir "obj/enc_debug"
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\encoder\me.c
+
+!IF  "$(CFG)" == "libx264 - Win32 Release"
+
+# PROP Intermediate_Dir "obj/enc_release"
+
+!ELSEIF  "$(CFG)" == "libx264 - Win32 Debug"
+
+# PROP Intermediate_Dir "obj/enc_debug"
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\encoder\ratecontrol.c
+
+!IF  "$(CFG)" == "libx264 - Win32 Release"
+
+# PROP Intermediate_Dir "obj/enc_release"
+
+!ELSEIF  "$(CFG)" == "libx264 - Win32 Debug"
+
+# PROP Intermediate_Dir "obj/enc_debug"
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\encoder\set.c
+
+!IF  "$(CFG)" == "libx264 - Win32 Release"
+
+# PROP Intermediate_Dir "obj/enc_release"
+
+!ELSEIF  "$(CFG)" == "libx264 - Win32 Debug"
+
+# PROP Intermediate_Dir "obj/enc_debug"
+
+!ENDIF 
+
+# End Source File
+# End Group
+# Begin Group "Core"
+
+# PROP Default_Filter ".c;.h;"
+# Begin Group "core_h"
+
+# PROP Default_Filter ".h"
+# Begin Source File
+
+SOURCE=..\..\core\bs.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\core\cabac.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\core\clip1.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\core\common.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\core\cpu.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\core\csp.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\core\dct.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\core\frame.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\core\macroblock.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\core\mc.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\core\pixel.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\core\predict.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\core\set.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\core\vlc.h
+# End Source File
+# End Group
+# Begin Group "I386"
+
+# PROP Default_Filter "*.h,*.c,*.asm"
+# Begin Source File
+
+SOURCE=..\..\core\i386\cpu.asm
+
+!IF  "$(CFG)" == "libx264 - Win32 Release"
+
+# Begin Custom Build
+IntDir=.\Release
+InputPath=..\..\core\i386\cpu.asm
+InputName=cpu
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasm -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ELSEIF  "$(CFG)" == "libx264 - Win32 Debug"
+
+# Begin Custom Build
+IntDir=.\Debug
+InputPath=..\..\core\i386\cpu.asm
+InputName=cpu
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasm -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE="..\..\core\i386\dct-c.c"
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\core\i386\dct.asm
+
+!IF  "$(CFG)" == "libx264 - Win32 Release"
+
+# Begin Custom Build
+IntDir=.\Release
+InputPath=..\..\core\i386\dct.asm
+InputName=dct
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasm -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ELSEIF  "$(CFG)" == "libx264 - Win32 Debug"
+
+# Begin Custom Build
+IntDir=.\Debug
+InputPath=..\..\core\i386\dct.asm
+InputName=dct
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasm -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\core\i386\dct.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\core\i386\pixel.asm
+
+!IF  "$(CFG)" == "libx264 - Win32 Release"
+
+# Begin Custom Build
+IntDir=.\Release
+InputPath=..\..\core\i386\pixel.asm
+InputName=pixel
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasm -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ELSEIF  "$(CFG)" == "libx264 - Win32 Debug"
+
+# Begin Custom Build
+IntDir=.\Debug
+InputPath=..\..\core\i386\pixel.asm
+InputName=pixel
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasm -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\core\i386\pixel.h
+# End Source File
+# End Group
+# Begin Source File
+
+SOURCE=..\..\core\cabac.c
+
+!IF  "$(CFG)" == "libx264 - Win32 Release"
+
+# PROP Intermediate_Dir "obj/core_release"
+
+!ELSEIF  "$(CFG)" == "libx264 - Win32 Debug"
+
+# PROP Intermediate_Dir "obj/core_debug"
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\core\common.c
+
+!IF  "$(CFG)" == "libx264 - Win32 Release"
+
+# PROP Intermediate_Dir "obj/core_release"
+
+!ELSEIF  "$(CFG)" == "libx264 - Win32 Debug"
+
+# PROP Intermediate_Dir "obj/core_debug"
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\core\cpu.c
+
+!IF  "$(CFG)" == "libx264 - Win32 Release"
+
+# PROP Intermediate_Dir "obj/core_release"
+
+!ELSEIF  "$(CFG)" == "libx264 - Win32 Debug"
+
+# PROP Intermediate_Dir "obj/core_debug"
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\core\csp.c
+
+!IF  "$(CFG)" == "libx264 - Win32 Release"
+
+!ELSEIF  "$(CFG)" == "libx264 - Win32 Debug"
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\core\dct.c
+
+!IF  "$(CFG)" == "libx264 - Win32 Release"
+
+# PROP Intermediate_Dir "obj/core_release"
+
+!ELSEIF  "$(CFG)" == "libx264 - Win32 Debug"
+
+# PROP Intermediate_Dir "obj/core_debug"
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\core\frame.c
+
+!IF  "$(CFG)" == "libx264 - Win32 Release"
+
+# PROP Intermediate_Dir "obj/core_release"
+
+!ELSEIF  "$(CFG)" == "libx264 - Win32 Debug"
+
+# PROP Intermediate_Dir "obj/core_debug"
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\core\macroblock.c
+
+!IF  "$(CFG)" == "libx264 - Win32 Release"
+
+# PROP Intermediate_Dir "obj/core_release"
+
+!ELSEIF  "$(CFG)" == "libx264 - Win32 Debug"
+
+# PROP Intermediate_Dir "obj/core_debug"
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\core\mc.c
+
+!IF  "$(CFG)" == "libx264 - Win32 Release"
+
+# PROP Intermediate_Dir "obj/core_release"
+
+!ELSEIF  "$(CFG)" == "libx264 - Win32 Debug"
+
+# PROP Intermediate_Dir "obj/core_debug"
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\core\mdate.c
+
+!IF  "$(CFG)" == "libx264 - Win32 Release"
+
+# PROP Intermediate_Dir "obj/core_release"
+
+!ELSEIF  "$(CFG)" == "libx264 - Win32 Debug"
+
+# PROP Intermediate_Dir "obj/core_debug"
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\core\pixel.c
+
+!IF  "$(CFG)" == "libx264 - Win32 Release"
+
+# PROP Intermediate_Dir "obj/core_release"
+
+!ELSEIF  "$(CFG)" == "libx264 - Win32 Debug"
+
+# PROP Intermediate_Dir "obj/core_debug"
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\core\predict.c
+
+!IF  "$(CFG)" == "libx264 - Win32 Release"
+
+# PROP Intermediate_Dir "obj/core_release"
+
+!ELSEIF  "$(CFG)" == "libx264 - Win32 Debug"
+
+# PROP Intermediate_Dir "obj/core_debug"
+
+!ENDIF 
+
+# End Source File
+# End Group
+# Begin Group "Dec"
+
+# PROP Default_Filter ".c"
+# Begin Group "dec_h"
+
+# PROP Default_Filter ".h"
+# Begin Source File
+
+SOURCE=..\..\decoder\macroblock.h
+
+!IF  "$(CFG)" == "libx264 - Win32 Release"
+
+# PROP Intermediate_Dir "obj/dec_release"
+
+!ELSEIF  "$(CFG)" == "libx264 - Win32 Debug"
+
+# PROP Intermediate_Dir "obj/dec_debug"
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\decoder\set.h
+
+!IF  "$(CFG)" == "libx264 - Win32 Release"
+
+# PROP Intermediate_Dir "obj/dec_release"
+
+!ELSEIF  "$(CFG)" == "libx264 - Win32 Debug"
+
+# PROP Intermediate_Dir "obj/dec_debug"
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\decoder\vlc.h
+
+!IF  "$(CFG)" == "libx264 - Win32 Release"
+
+# PROP Intermediate_Dir "obj/dec_release"
+
+!ELSEIF  "$(CFG)" == "libx264 - Win32 Debug"
+
+# PROP Intermediate_Dir "obj/dec_debug"
+
+!ENDIF 
+
+# End Source File
+# End Group
+# Begin Source File
+
+SOURCE=..\..\decoder\decoder.c
+
+!IF  "$(CFG)" == "libx264 - Win32 Release"
+
+# PROP Intermediate_Dir "obj/dec_release"
+# PROP Exclude_From_Build 1
+
+!ELSEIF  "$(CFG)" == "libx264 - Win32 Debug"
+
+# PROP Intermediate_Dir "obj/dec_debug"
+# PROP Exclude_From_Build 1
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\decoder\macroblock.c
+
+!IF  "$(CFG)" == "libx264 - Win32 Release"
+
+# PROP Intermediate_Dir "obj/dec_release"
+# PROP Exclude_From_Build 1
+
+!ELSEIF  "$(CFG)" == "libx264 - Win32 Debug"
+
+# PROP Intermediate_Dir "obj/dec_debug"
+# PROP Exclude_From_Build 1
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\decoder\set.c
+
+!IF  "$(CFG)" == "libx264 - Win32 Release"
+
+# PROP Intermediate_Dir "obj/dec_release"
+# PROP Exclude_From_Build 1
+
+!ELSEIF  "$(CFG)" == "libx264 - Win32 Debug"
+
+# PROP Intermediate_Dir "obj/dec_debug"
+# PROP Exclude_From_Build 1
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\decoder\vlc.c
+
+!IF  "$(CFG)" == "libx264 - Win32 Release"
+
+# PROP Intermediate_Dir "obj/dec_release"
+# PROP Exclude_From_Build 1
+
+!ELSEIF  "$(CFG)" == "libx264 - Win32 Debug"
+
+# PROP Intermediate_Dir "obj/dec_debug"
+# PROP Exclude_From_Build 1
+
+!ENDIF 
+
+# End Source File
+# End Group
+# Begin Group "extras"
+
+# PROP Default_Filter ".c"
+# Begin Group "extras_h"
+
+# PROP Default_Filter ".h"
+# Begin Source File
+
+SOURCE=..\..\extras\getopt.h
+
+!IF  "$(CFG)" == "libx264 - Win32 Release"
+
+# PROP Intermediate_Dir "obj/util_release"
+
+!ELSEIF  "$(CFG)" == "libx264 - Win32 Debug"
+
+# PROP Intermediate_Dir "obj/util_debug"
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\extras\stdint.h
+
+!IF  "$(CFG)" == "libx264 - Win32 Release"
+
+# PROP Intermediate_Dir "obj/util_release"
+
+!ELSEIF  "$(CFG)" == "libx264 - Win32 Debug"
+
+# PROP Intermediate_Dir "obj/util_debug"
+
+!ENDIF 
+
+# End Source File
+# End Group
+# Begin Source File
+
+SOURCE=..\..\extras\getopt.c
+
+!IF  "$(CFG)" == "libx264 - Win32 Release"
+
+# PROP Intermediate_Dir "obj/util_release"
+
+!ELSEIF  "$(CFG)" == "libx264 - Win32 Debug"
+
+# PROP Intermediate_Dir "obj/util_debug"
+
+!ENDIF 
+
+# End Source File
+# End Group
+# End Target
+# End Project
diff --git a/build/win32/x264.dsp b/build/win32/x264.dsp
new file mode 100644
index 00000000..9fe8398f
--- /dev/null
+++ b/build/win32/x264.dsp
@@ -0,0 +1,94 @@
+# Microsoft Developer Studio Project File - Name="x264" - Package Owner=<4>
+# Microsoft Developer Studio Generated Build File, Format Version 6.00
+# ** DO NOT EDIT **
+
+# TARGTYPE "Win32 (x86) Console Application" 0x0103
+
+CFG=x264 - Win32 Debug
+!MESSAGE This is not a valid makefile. To build this project using NMAKE,
+!MESSAGE use the Export Makefile command and run
+!MESSAGE 
+!MESSAGE NMAKE /f "x264.mak".
+!MESSAGE 
+!MESSAGE You can specify a configuration when running NMAKE
+!MESSAGE by defining the macro CFG on the command line. For example:
+!MESSAGE 
+!MESSAGE NMAKE /f "x264.mak" CFG="x264 - Win32 Debug"
+!MESSAGE 
+!MESSAGE Possible choices for configuration are:
+!MESSAGE 
+!MESSAGE "x264 - Win32 Release" (based on "Win32 (x86) Console Application")
+!MESSAGE "x264 - Win32 Debug" (based on "Win32 (x86) Console Application")
+!MESSAGE 
+
+# Begin Project
+# PROP AllowPerConfigDependencies 0
+# PROP Scc_ProjName ""
+# PROP Scc_LocalPath ""
+CPP=cl.exe
+RSC=rc.exe
+
+!IF  "$(CFG)" == "x264 - Win32 Release"
+
+# PROP BASE Use_MFC 0
+# PROP BASE Use_Debug_Libraries 0
+# PROP BASE Output_Dir "Release"
+# PROP BASE Intermediate_Dir "Release"
+# PROP BASE Target_Dir ""
+# PROP Use_MFC 0
+# PROP Use_Debug_Libraries 0
+# PROP Output_Dir "obj/x264_Release"
+# PROP Intermediate_Dir "obj/x264_Release"
+# PROP Ignore_Export_Lib 0
+# PROP Target_Dir ""
+# ADD BASE CPP /nologo /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /c
+# ADD CPP /nologo /W3 /GX /O2 /I "./core" /I "./encode" /I "./decode" /I "../../extras" /I "../.." /D "NDEBUG" /D "_CONSOLE" /D "WIN32" /D "_MBCS" /D "__X264__" /D "HAVE_MMXEXT" /YX /FD /c
+# ADD BASE RSC /l 0x804 /d "NDEBUG"
+# ADD RSC /l 0x804 /d "NDEBUG"
+BSC32=bscmake.exe
+# ADD BASE BSC32 /nologo
+# ADD BSC32 /nologo
+LINK32=link.exe
+# ADD BASE LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /machine:I386
+# ADD LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /machine:I386 /out:"bin/x264.exe"
+
+!ELSEIF  "$(CFG)" == "x264 - Win32 Debug"
+
+# PROP BASE Use_MFC 0
+# PROP BASE Use_Debug_Libraries 1
+# PROP BASE Output_Dir "Debug"
+# PROP BASE Intermediate_Dir "Debug"
+# PROP BASE Target_Dir ""
+# PROP Use_MFC 0
+# PROP Use_Debug_Libraries 1
+# PROP Output_Dir "obj/x264_Debug"
+# PROP Intermediate_Dir "obj/x264_Debug"
+# PROP Ignore_Export_Lib 0
+# PROP Target_Dir ""
+# ADD BASE CPP /nologo /W3 /Gm /GX /ZI /Od /D "WIN32" /D "_DEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /GZ /c
+# ADD CPP /nologo /W3 /Gm /GX /Zi /Od /I "../../core" /I "../../encode" /I "../../decode" /I "../../extras" /I "../.." /D "_DEBUG" /D "_CONSOLE" /D "WIN32" /D "_MBCS" /D "__X264__" /D "HAVE_MMXEXT" /YX /FD /GZ /c
+# ADD BASE RSC /l 0x804 /d "_DEBUG"
+# ADD RSC /l 0x804 /d "_DEBUG"
+BSC32=bscmake.exe
+# ADD BASE BSC32 /nologo
+# ADD BSC32 /nologo
+LINK32=link.exe
+# ADD BASE LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /debug /machine:I386 /pdbtype:sept
+# ADD LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /debug /machine:I386 /out:"bin/x264.exe" /pdbtype:sept
+
+!ENDIF 
+
+# Begin Target
+
+# Name "x264 - Win32 Release"
+# Name "x264 - Win32 Debug"
+# Begin Source File
+
+SOURCE=..\..\x264.c
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\x264.h
+# End Source File
+# End Target
+# End Project
diff --git a/build/win32/x264.dsw b/build/win32/x264.dsw
new file mode 100644
index 00000000..8ef22bf7
--- /dev/null
+++ b/build/win32/x264.dsw
@@ -0,0 +1,44 @@
+Microsoft Developer Studio Workspace File, Format Version 6.00
+# WARNING: DO NOT EDIT OR DELETE THIS WORKSPACE FILE!
+
+###############################################################################
+
+Project: "libx264"=.\libx264.dsp - Package Owner=<4>
+
+Package=<5>
+{{{
+}}}
+
+Package=<4>
+{{{
+}}}
+
+###############################################################################
+
+Project: "x264"=.\x264.dsp - Package Owner=<4>
+
+Package=<5>
+{{{
+}}}
+
+Package=<4>
+{{{
+    Begin Project Dependency
+    Project_Dep_Name libx264
+    End Project Dependency
+}}}
+
+###############################################################################
+
+Global:
+
+Package=<5>
+{{{
+}}}
+
+Package=<3>
+{{{
+}}}
+
+###############################################################################
+
diff --git a/core/bs.h b/core/bs.h
new file mode 100644
index 00000000..380799f2
--- /dev/null
+++ b/core/bs.h
@@ -0,0 +1,423 @@
+/*****************************************************************************
+ * bs.h :
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: bs.h,v 1.1 2004/06/03 19:27:06 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#ifdef _BS_H
+#warning FIXME Multiple inclusion of bs.h
+#else
+#define _BS_H
+
+typedef struct bs_s    /* bit-stream cursor over a caller-supplied byte buffer */
+{
+    uint8_t *p_start;  /* first byte of the buffer */
+    uint8_t *p;        /* byte currently being read or written */
+    uint8_t *p_end;    /* one past the last byte of the buffer */
+
+    int     i_left;    /* number of bits still unused in *p (8 = untouched byte) */
+} bs_t;
+
+static inline void bs_init( bs_t *s, void *p_data, int i_data )
+{
+    uint8_t *p_buffer = p_data;         /* byte view of the i_data-byte buffer */
+    s->i_left  = 8;                     /* the first byte has all 8 bits unused */
+    s->p_end   = p_buffer + i_data;     /* one past the last usable byte */
+    s->p_start = s->p = p_buffer;       /* cursor starts at the beginning */
+}
+static inline int bs_pos( bs_t *s )
+{   /* cursor position in bits: whole bytes passed plus bits used in *s->p */
+    return 8 * ( s->p - s->p_start ) + ( 8 - s->i_left );
+}
+static inline int bs_eof( bs_t *s )
+{   /* 1 once the cursor has reached (or passed) the end of the buffer, else 0 */
+    return s->p_end <= s->p;
+}
+static inline uint32_t bs_read( bs_t *s, int i_count )
+{   /* Read i_count bits (meant for i_count <= 32), MSB first; result is right-aligned. */
+    static const uint32_t i_mask[33] = { 0x00,  /* i_mask[n] has the n low bits set */
+                                  0x01,      0x03,      0x07,      0x0f,
+                                  0x1f,      0x3f,      0x7f,      0xff,
+                                  0x1ff,     0x3ff,     0x7ff,     0xfff,
+                                  0x1fff,    0x3fff,    0x7fff,    0xffff,
+                                  0x1ffff,   0x3ffff,   0x7ffff,   0xfffff,
+                                  0x1fffff,  0x3fffff,  0x7fffff,  0xffffff,
+                                  0x1ffffff, 0x3ffffff, 0x7ffffff, 0xfffffff,
+                                  0x1fffffff,0x3fffffff,0x7fffffff,0xffffffff};
+    int      i_shr;
+    uint32_t i_result = 0;
+    /* Running out of buffer ends the read early with the bits gathered so far. */
+    while( i_count > 0 )
+    {
+        if( s->p >= s->p_end )
+        {
+            break;
+        }
+        /* i_shr >= 0 means the current byte alone can satisfy the request */
+        if( ( i_shr = s->i_left - i_count ) >= 0 )
+        {
+            /* take i_count bits, leaving the i_shr lowest bits of the byte unread */
+            i_result |= ( *s->p >> i_shr ) & i_mask[i_count];
+            s->i_left -= i_count;
+            if( s->i_left == 0 )
+            {
+                s->p++;
+                s->i_left = 8;
+            }
+            return( i_result );
+        }
+        else
+        {
+            /* use up the rest of this byte; -i_shr bits are still to come below it */
+            i_result |= ( *s->p & i_mask[s->i_left] ) << -i_shr;
+            i_count  -= s->i_left;
+            s->p++;
+            s->i_left = 8;
+        }
+    }
+
+    return( i_result );
+}
+
+#if 0 /* everything down to the matching #endif is compiled out */
+/* Byte swap using the x86 "bswap" instruction; per the original note it needs a CPU newer than the i386 */
+static uint32_t bswap32( uint32_t x )
+{
+    asm( "bswap   %0": "=r" (x):"0" (x));
+    return x;
+}
+/* Alternative bs_read built on a 32-bit window; works only for i_count <= 32 - 7 */
+static inline uint32_t bs_read( bs_t *s, int i_count )
+{
+    if( s->p < s->p_end && i_count > 0 )
+    {
+#if 0 /* portable byte-by-byte load of the window */
+        uint32_t i_cache = ((s->p[0] << 24)+(s->p[1] << 16)+(s->p[2] << 8)+s->p[3]) << (8-s->i_left);
+#else /* single 32-bit load followed by a byte swap */
+        uint32_t i_cache = bswap32( *((uint32_t*)s->p) ) << (8-s->i_left);
+#endif
+        uint32_t i_ret = i_cache >> ( 32 - i_count);
+        /* consume the bits, then step over every byte that has been used up */
+        s->i_left -= i_count;
+#if 0 /* closed-form advance */
+        if( s->i_left <= 0 )
+        {
+            int i_skip = (8-s->i_left) >> 3;
+
+            s->p += i_skip;
+
+            s->i_left += i_skip << 3;
+        }
+#else /* byte-at-a-time advance */
+        while( s->i_left <= 0 )
+        {
+            s->p++;
+            s->i_left += 8;
+        }
+#endif
+        return i_ret;
+    }
+    return 0;
+}
+
+#endif
+static inline uint32_t bs_read1( bs_t *s )
+{
+    unsigned int i_bit;
+
+    /* Past the end of the buffer: report 0 and leave the cursor untouched. */
+    if( s->p >= s->p_end )
+    {
+        return 0;
+    }
+    /* Bits come out MSB first; i_left is the count of unread bits in *s->p. */
+    s->i_left--;
+    i_bit = ( *s->p >> s->i_left ) & 0x01;
+    if( s->i_left == 0 )
+    {   /* byte exhausted: move on to the next one */
+        s->p++;
+        s->i_left = 8;
+    }
+    return i_bit;
+}
+static inline uint32_t bs_show( bs_t *s, int i_count )
+{   /* Peek at the next i_count bits (i_count <= 24 + i_left) without advancing; 0 at end of buffer. */
+    uint32_t i_cache = 0;   /* 32-bit window starting at the current byte */
+    int      i;
+    if( s->p >= s->p_end || i_count <= 0 )
+    {
+        return 0;
+    }
+    for( i = 0; i < 4; i++ )    /* unsigned arithmetic; bytes beyond p_end count as 0 */
+    {
+        i_cache = ( i_cache << 8 ) | ( i < s->p_end - s->p ? s->p[i] : 0 );
+    }
+    return ( i_cache << ( 8 - s->i_left ) ) >> ( 32 - i_count );
+}
+
+/* Skip i_count bits without looking at them; the cursor is not checked against p_end. */
+static inline void bs_skip( bs_t *s, int i_count )
+{
+    s->i_left -= i_count;
+    if( s->i_left <= 0 )
+    {   /* bytes used up, counting the current one when i_left is exactly 0 */
+        const int i_bytes = ( 8 - s->i_left ) >> 3;
+        s->p      += i_bytes;
+        s->i_left += i_bytes << 3;      /* back into the 1..8 range */
+    }
+}
+
+
+static inline int bs_read_ue( bs_t *s )
+{   /* Decode one ue(v) code: i zero bits, a 1 bit, then an i-bit suffix. */
+    int i = 0;      /* number of leading zero bits seen so far (at most 32) */
+    while( bs_read1( s ) == 0 && s->p < s->p_end && i < 32 )
+    {
+        i++;
+    }
+    /* 2^i - 1 is formed in 64 bits: "1 << i" on an int is undefined for i >= 31 */
+    return (int)( (uint32_t)( ( (uint64_t)1 << i ) - 1 ) + bs_read( s, i ) );
+}
+static inline int bs_read_se( bs_t *s )
+{   /* se(v): an odd code k maps to +(k + 1) / 2, an even code maps to -(k / 2) */
+    const int i_code   = bs_read_ue( s );
+    const int i_is_odd = i_code & 0x01;
+    return i_is_odd ? ( i_code + 1 ) / 2 : -( i_code / 2 );
+}
+
+static inline int bs_read_te( bs_t *s, int x )
+{   /* te(v): x > 1 reads a ue(v) code, x == 1 reads one inverted bit, otherwise 0 */
+    if( x > 1 )
+    {
+        return bs_read_ue( s );
+    }
+    if( x == 1 )
+    {
+        return bs_read1( s ) ? 0 : 1;
+    }
+    return 0;
+}
+
+/* TODO optimize (write x bits at once) */
+static inline void bs_write( bs_t *s, int i_count, uint32_t i_bits )
+{   /* Append the i_count low bits of i_bits, MSB first; stops once the buffer is full. */
+    while( i_count > 0 )
+    {
+        if( s->p >= s->p_end )
+        {
+            break;
+        }
+        /* i_bits is treated as zero-extended: positions >= 32 are written as 0
+         * instead of shifting a 32-bit value by 32 or more (undefined in C). */
+        i_count--;
+        if( i_count < 32 && ( ( i_bits >> i_count ) & 0x01 ) )
+        {
+            *s->p |= 1 << ( s->i_left - 1 );
+        }
+        else
+        {
+            *s->p &= ~( 1 << ( s->i_left - 1 ) );
+        }
+        s->i_left--;
+        if( s->i_left == 0 )
+        {
+            s->p++;
+            s->i_left = 8;
+        }
+    }
+}
+
+static inline void bs_write1( bs_t *s, uint32_t i_bits )
+{
+    int i_mask;
+
+    /* Append the low bit of i_bits; a full buffer is left untouched. */
+    if( s->p >= s->p_end )
+    {
+        return;
+    }
+
+    /* bytes fill from the MSB down, so the target is bit (i_left - 1) */
+    s->i_left--;
+    i_mask = 1 << s->i_left;
+    *s->p  = ( i_bits & 0x01 ) ? ( *s->p | i_mask ) : ( *s->p & ~i_mask );
+
+    if( s->i_left == 0 )
+    {   /* byte complete: continue with the next one */
+        s->p++;
+        s->i_left = 8;
+    }
+}
+
+static inline void bs_align( bs_t *s )
+{
+    /* Abandon the rest of a partially used byte; nothing is written. */
+    const int b_partial = ( s->i_left != 8 );
+
+    s->p     += b_partial;      /* step past the byte only if it was started */
+    s->i_left = 8;
+}
+static inline void bs_align_0( bs_t *s )
+{
+    /* Zero-fill the rest of a partially written byte; with 8 unused bits
+     * the cursor is already aligned and nothing is emitted. */
+    const int i_pad = ( s->i_left != 8 ) ? s->i_left : 0;
+    bs_write( s, i_pad, 0 );
+}
+static inline void bs_align_1( bs_t *s )
+{   /* pad a partially written byte with 1 bits up to the byte boundary */
+    if( s->i_left != 8 )
+    {   /* all i_left low bits set; a literal 1 would emit zeros then a single 1 */
+        bs_write( s, s->i_left, ( 1u << s->i_left ) - 1 );
+    }
+}
+
+
+
+/* golomb functions */
+
+static inline void bs_write_ue( bs_t *s, unsigned int val )
+{   /* Write val as ue(v): (n - 1) zero bits, then val + 1 on its n significant bits. */
+    int i_size = 0;
+    static const int i_size0_255[256] =
+    {
+        1,1,2,2,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,
+        6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
+        7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
+        7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
+        8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
+        8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
+        8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
+        8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8
+    };
+    /* i_size0_255[x] is the number of significant bits of x (1 for x == 0). */
+    if( val == 0 )
+    {
+        bs_write( s, 1, 1 );
+    }
+    else
+    {
+        unsigned int tmp = ++val;   /* NOTE: wraps to 0 when val is UINT_MAX */
+        /* bit length of val + 1: strip whole 16- and 8-bit groups, then look up */
+        if( tmp >= 0x00010000 )
+        {
+            i_size += 16;
+            tmp >>= 16;
+        }
+        if( tmp >= 0x100 )
+        {
+            i_size += 8;
+            tmp >>= 8;
+        }
+        i_size += i_size0_255[tmp];
+        bs_write( s, i_size - 1, 0 );   /* zero prefix on its own: one call would need up to 63 bits */
+        bs_write( s, i_size, val );     /* the value itself always fits in 32 bits */
+    }
+}
+
+static inline void bs_write_se( bs_t *s, int val )
+{   /* se(v): v > 0 is sent as the unsigned code 2v - 1, v <= 0 as -2v */
+    bs_write_ue( s, val > 0 ? 2 * val - 1 : 2 * -val );
+}
+
+static inline void bs_write_te( bs_t *s, int x, int val )
+{   /* te(v): x > 1 writes ue(v), x == 1 writes the inverted low bit of val, else nothing */
+    if( x > 1 )
+    {
+        bs_write_ue( s, val );
+    }
+    else if( x == 1 )
+    {
+        bs_write( s, 1, ( val & 0x01 ) ^ 0x01 );
+    }
+}
+
+static inline void bs_rbsp_trailing( bs_t *s )
+{
+    /* stop bit: a single 1 */
+    bs_write1( s, 1 );
+
+    /* then 0 bits up to the byte boundary (none if already aligned) */
+    bs_align_0( s );
+}
+
+static inline int bs_size_ue( unsigned int val )
+{   /* Length in bits of the ue(v) code for val: 2 * bitlength(val + 1) - 1. */
+    static const int i_size0_254[255] =
+    {
+        1, 3, 3, 5, 5, 5, 5, 7, 7, 7, 7, 7, 7, 7, 7,
+        9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+        11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,
+        11,11,11,11,11,11,11,11,11,13,13,13,13,13,13,13,13,13,13,13,13,13,13,
+        13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,
+        13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,
+        13,13,13,13,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
+        15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
+        15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
+        15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
+        15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
+        15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15
+    };
+    int i_size = 0;     /* code bits accounted for by the groups stripped below */
+
+    if( val < 255 )
+    {
+        return i_size0_254[val];
+    }
+    /* Work on val + 1; every bit shifted out costs two code bits.  The shifted
+     * value is kept intact so the final index (val - 1) always lies in 0..254. */
+    val++;
+    if( val == 0 )
+    {
+        return 65;      /* val + 1 wrapped (32-bit unsigned): 33 significant bits */
+    }
+    if( val >= 0x10000 )
+    {
+        i_size += 32;
+        val >>= 16;
+    }
+    if( val >= 0x100 )
+    {
+        i_size += 16;
+        val >>= 8;
+    }
+    return i_size0_254[val - 1] + i_size;
+}
+
+static inline int bs_size_se( int val )
+{   /* bit cost of se(v): v > 0 costs as the unsigned code 2v - 1, v <= 0 as -2v */
+    return bs_size_ue( val > 0 ? 2 * val - 1 : 2 * -val );
+}
+
+static inline int bs_size_te( int x, int val )
+{
+    /* Bit cost of te(v) for a value range of x:
+     *   x <  1 : nothing is coded,
+     *   x == 1 : a single bit,
+     *   x >  1 : same cost as ue(v). */
+    if( x < 1 )
+    {
+        return 0;
+    }
+    return ( x == 1 ) ? 1 : bs_size_ue( val );
+}
+
+
+
+#endif
diff --git a/core/cabac.c b/core/cabac.c
new file mode 100644
index 00000000..51a06e2c
--- /dev/null
+++ b/core/cabac.c
@@ -0,0 +1,1044 @@
+/*****************************************************************************
+ * cabac.c: h264 encoder library
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: cabac.c,v 1.1 2004/06/03 19:27:06 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+#include <stdio.h>
+
+#include "common.h"
+
+//#define TRACE 1
+
+/* Debugging purpose ONLY !!!! */
+#ifdef TRACE
+static int binCount = 0;
+#endif
+
+
+static const int x264_cabac_context_init_I[399][2] = /* one pair per context index 0..398 */
+{
+    /* 0 - 10 */
+    { 20, -15 }, {  2, 54 },  {  3,  74 }, { 20, -15 },
+    {  2,  54 }, {  3, 74 },  { -28,127 }, { -23, 104 },
+    { -6,  53 }, { -1, 54 },  {  7,  51 },
+
+    /* 11 - 23 unused for I */
+    { 0, 0 },    { 0, 0 },    { 0, 0 },      { 0, 0 },
+    { 0, 0 },    { 0, 0 },    { 0, 0 },      { 0, 0 },
+    { 0, 0 },    { 0, 0 },    { 0, 0 },      { 0, 0 },
+    { 0, 0 },
+
+    /* 24 - 39 (all zero) */
+    { 0, 0 },    { 0, 0 },    { 0, 0 },      { 0, 0 },
+    { 0, 0 },    { 0, 0 },    { 0, 0 },      { 0, 0 },
+    { 0, 0 },    { 0, 0 },    { 0, 0 },      { 0, 0 },
+    { 0, 0 },    { 0, 0 },    { 0, 0 },      { 0, 0 },
+
+    /* 40 - 53 (all zero) */
+    { 0, 0 },    { 0, 0 },    { 0, 0 },      { 0, 0 },
+    { 0, 0 },    { 0, 0 },    { 0, 0 },      { 0, 0 },
+    { 0, 0 },    { 0, 0 },    { 0, 0 },      { 0, 0 },
+    { 0, 0 },    { 0, 0 },
+
+    /* 54 - 59 (all zero) */
+    { 0, 0 },    { 0, 0 },    { 0, 0 },      { 0, 0 },
+    { 0, 0 },    { 0, 0 },
+
+    /* 60 - 69 */
+    { 0, 41 },   { 0, 63 },   { 0, 63 },     { 0, 63 },
+    { -9, 83 },  { 4, 86 },   { 0, 97 },     { -7, 72 },
+    { 13, 41 },  { 3, 62 },
+
+    /* 70 -> 87 */
+    { 0, 11 },   { 1, 55 },   { 0, 69 },     { -17, 127 },
+    { -13, 102 },{ 0, 82 },   { -7, 74 },    { -21, 107 },
+    { -27, 127 },{ -31, 127 },{ -24, 127 },  { -18, 95 },
+    { -27, 127 },{ -21, 114 },{ -30, 127 },  { -17, 123 },
+    { -12, 115 },{ -16, 122 },
+
+    /* 88 -> 104 */
+    { -11, 115 },{ -12, 63 }, { -2, 68 },    { -15, 84 },
+    { -13, 104 },{ -3, 70 },  { -8, 93 },    { -10, 90 },
+    { -30, 127 },{ -1, 74 },  { -6, 97 },    { -7, 91 },
+    { -20, 127 },{ -4, 56 },  { -5, 82 },    { -7, 76 },
+    { -22, 125 },
+
+    /* 105 -> 135 */
+    { -7, 93 },  { -11, 87 }, { -3, 77 },    { -5, 71 },
+    { -4, 63 },  { -4, 68 },  { -12, 84 },   { -7, 62 },
+    { -7, 65 },  { 8, 61 },   { 5, 56 },     { -2, 66 },
+    { 1, 64 },   { 0, 61 },   { -2, 78 },    { 1, 50 },
+    { 7, 52 },   { 10, 35 },  { 0, 44 },     { 11, 38 },
+    { 1, 45 },   { 0, 46 },   { 5, 44 },     { 31, 17 },
+    { 1, 51 },   { 7, 50 },   { 28, 19 },    { 16, 33 },
+    { 14, 62 },  { -13, 108 },{ -15, 100 },
+
+    /* 136 -> 165 */
+    { -13, 101 },{ -13, 91 }, { -12, 94 },   { -10, 88 },
+    { -16, 84 }, { -10, 86 }, { -7, 83 },    { -13, 87 },
+    { -19, 94 }, { 1, 70 },   { 0, 72 },     { -5, 74 },
+    { 18, 59 },  { -8, 102 }, { -15, 100 },  { 0, 95 },
+    { -4, 75 },  { 2, 72 },   { -11, 75 },   { -3, 71 },
+    { 15, 46 },  { -13, 69 }, { 0, 62 },     { 0, 65 },
+    { 21, 37 },  { -15, 72 }, { 9, 57 },     { 16, 54 },
+    { 0, 62 },   { 12, 72 },
+
+    /* 166 -> 196 */
+    { 24, 0 },   { 15, 9 },   { 8, 25 },     { 13, 18 },
+    { 15, 9 },   { 13, 19 },  { 10, 37 },    { 12, 18 },
+    { 6, 29 },   { 20, 33 },  { 15, 30 },    { 4, 45 },
+    { 1, 58 },   { 0, 62 },   { 7, 61 },     { 12, 38 },
+    { 11, 45 },  { 15, 39 },  { 11, 42 },    { 13, 44 },
+    { 16, 45 },  { 12, 41 },  { 10, 49 },    { 30, 34 },
+    { 18, 42 },  { 10, 55 },  { 17, 51 },    { 17, 46 },
+    { 0, 89 },   { 26, -19 }, { 22, -17 },
+
+    /* 197 -> 226 */
+    { 26, -17 }, { 30, -25 }, { 28, -20 },   { 33, -23 },
+    { 37, -27 }, { 33, -23 }, { 40, -28 },   { 38, -17 },
+    { 33, -11 }, { 40, -15 }, { 41, -6 },    { 38, 1 },
+    { 41, 17 },  { 30, -6 },  { 27, 3 },     { 26, 22 },
+    { 37, -16 }, { 35, -4 },  { 38, -8 },    { 38, -3 },
+    { 37, 3 },   { 38, 5 },   { 42, 0 },     { 35, 16 },
+    { 39, 22 },  { 14, 48 },  { 27, 37 },    { 21, 60 },
+    { 12, 68 },  { 2, 97 },
+
+    /* 227 -> 251 */
+    { -3, 71 },  { -6, 42 },  { -5, 50 },    { -3, 54 },
+    { -2, 62 },  { 0, 58 },   { 1, 63 },     { -2, 72 },
+    { -1, 74 },  { -9, 91 },  { -5, 67 },    { -5, 27 },
+    { -3, 39 },  { -2, 44 },  { 0, 46 },     { -16, 64 },
+    { -8, 68 },  { -10, 78 }, { -6, 77 },    { -10, 86 },
+    { -12, 92 }, { -15, 55 }, { -10, 60 },   { -6, 62 },
+    { -4, 65 },
+
+    /* 252 -> 275 */
+    { -12, 73 }, { -8, 76 },  { -7, 80 },    { -9, 88 },
+    { -17, 110 },{ -11, 97 }, { -20, 84 },   { -11, 79 },
+    { -6, 73 },  { -4, 74 },  { -13, 86 },   { -13, 96 },
+    { -11, 97 }, { -19, 117 },{ -8, 78 },    { -5, 33 },
+    { -4, 48 },  { -2, 53 },  { -3, 62 },    { -13, 71 },
+    { -10, 79 }, { -12, 86 }, { -13, 90 },   { -14, 97 },
+
+    /* 276 a bit special (not used, x264_cabac_encode_bypass is used instead) */
+    { 0, 0 },
+
+    /* 277 -> 307 */
+    { -6, 93 },  { -6, 84 },  { -8, 79 },    { 0, 66 },
+    { -1, 71 },  { 0, 62 },   { -2, 60 },    { -2, 59 },
+    { -5, 75 },  { -3, 62 },  { -4, 58 },    { -9, 66 },
+    { -1, 79 },  { 0, 71 },   { 3, 68 },     { 10, 44 },
+    { -7, 62 },  { 15, 36 },  { 14, 40 },    { 16, 27 },
+    { 12, 29 },  { 1, 44 },   { 20, 36 },    { 18, 32 },
+    { 5, 42 },   { 1, 48 },   { 10, 62 },    { 17, 46 },
+    { 9, 64 },   { -12, 104 },{ -11, 97 },
+
+    /* 308 -> 337 */
+    { -16, 96 }, { -7, 88 },  { -8, 85 },    { -7, 85 },
+    { -9, 85 },  { -13, 88 }, { 4, 66 },     { -3, 77 },
+    { -3, 76 },  { -6, 76 },  { 10, 58 },    { -1, 76 },
+    { -1, 83 },  { -7, 99 },  { -14, 95 },   { 2, 95 },
+    { 0, 76 },   { -5, 74 },  { 0, 70 },     { -11, 75 },
+    { 1, 68 },   { 0, 65 },   { -14, 73 },   { 3, 62 },
+    { 4, 62 },   { -1, 68 },  { -13, 75 },   { 11, 55 },
+    { 5, 64 },   { 12, 70 },
+
+    /* 338 -> 368 */
+    { 15, 6 },   { 6, 19 },   { 7, 16 },     { 12, 14 },
+    { 18, 13 },  { 13, 11 },  { 13, 15 },    { 15, 16 },
+    { 12, 23 },  { 13, 23 },  { 15, 20 },    { 14, 26 },
+    { 14, 44 },  { 17, 40 },  { 17, 47 },    { 24, 17 },
+    { 21, 21 },  { 25, 22 },  { 31, 27 },    { 22, 29 },
+    { 19, 35 },  { 14, 50 },  { 10, 57 },    { 7, 63 },
+    { -2, 77 },  { -4, 82 },  { -3, 94 },    { 9, 69 },
+    { -12, 109 },{ 36, -35 }, { 36, -34 },
+
+    /* 369 -> 398 */
+    { 32, -26 }, { 37, -30 }, { 44, -32 },   { 34, -18 },
+    { 34, -15 }, { 40, -15 }, { 33, -7 },    { 35, -5 },
+    { 33, 0 },   { 38, 2 },   { 33, 13 },    { 23, 35 },
+    { 13, 58 },  { 29, -3 },  { 26, 0 },     { 22, 30 },
+    { 31, -7 },  { 35, -15 }, { 34, -3 },    { 34, 3 },
+    { 36, -1 },  { 34, 5 },   { 32, 11 },    { 35, 5 },
+    { 34, 12 },  { 39, 11 },  { 30, 29 },    { 34, 26 },
+    { 29, 39 },  { 19, 66 }
+};
+
+static const int x264_cabac_context_init_PB[3][399][2] =
+{
+    /* i_cabac_init_idc == 0 */
+    {
+        /* 0 - 10 */
+        {  20, -15 }, {   2,  54 }, {   3,  74 }, {  20, -15 },
+        {   2,  54 }, {   3,  74 }, { -28, 127 }, { -23, 104 },
+        {  -6,  53 }, {  -1,  54 }, {   7,  51 },
+
+        /* 11 - 23 */
+        {  23,  33 }, {  23,   2 }, {  21,   0 }, {   1,   9 },
+        {   0,  49 }, { -37, 118 }, {   5,  57 }, { -13,  78 },
+        { -11,  65 }, {   1,  62 }, {  12,  49 }, {  -4,  73 },
+        {  17,  50 },
+
+        /* 24 - 39 */
+        {  18,  64 }, {   9,  43 }, {  29,   0 }, {  26,  67 },
+        {  16,  90 }, {   9, 104 }, { -46, 127 }, { -20, 104 },
+        {   1,  67 }, { -13,  78 }, { -11,  65 }, {   1,  62 },
+        {  -6,  86 }, { -17,  95 }, {  -6,  61 }, {   9,  45 },
+
+        /* 40 - 53 */
+        {  -3,  69 }, {  -6,  81 }, { -11,  96 }, {   6,  55 },
+        {   7,  67 }, {  -5,  86 }, {   2,  88 }, {   0,  58 },
+        {  -3,  76 }, { -10,  94 }, {   5,  54 }, {   4,  69 },
+        {  -3,  81 }, {   0,  88 },
+
+        /* 54 - 59 */
+        {  -7,  67 }, {  -5,  74 }, {  -4,  74 }, {  -5,  80 },
+        {  -7,  72 }, {   1,  58 },
+
+        /* 60 - 69 */
+        {   0,  41 }, {   0,  63 }, {   0,  63 }, { 0, 63 },
+        {  -9,  83 }, {   4,  86 }, {   0,  97 }, { -7, 72 },
+        {  13,  41 }, {   3,  62 },
+
+        /* 70 - 104 */
+        {   0,  45 }, {  -4,  78 }, {  -3,  96 }, { -27,  126 },
+        { -28,  98 }, { -25, 101 }, { -23,  67 }, { -28,  82 },
+        { -20,  94 }, { -16,  83 }, { -22, 110 }, { -21,  91 },
+        { -18, 102 }, { -13,  93 }, { -29, 127 }, {  -7,  92 },
+        {  -5,  89 }, {  -7,  96 }, { -13, 108 }, {  -3,  46 },
+        {  -1,  65 }, {  -1,  57 }, {  -9,  93 }, {  -3,  74 },
+        {  -9,  92 }, {  -8,  87 }, { -23, 126 }, {   5,  54 },
+        {   6,  60 }, {   6,  59 }, {   6,  69 }, {  -1,  48 },
+        {   0,  68 }, {  -4,  69 }, {  -8,  88 },
+
+        /* 105 -> 165 */
+        {  -2,  85 }, {  -6,  78 }, {  -1,  75 }, {  -7,  77 },
+        {   2,  54 }, {   5,  50 }, {  -3,  68 }, {   1,  50 },
+        {   6,  42 }, {  -4,  81 }, {   1,  63 }, {  -4,  70 },
+        {   0,  67 }, {   2,  57 }, {  -2,  76 }, {  11,  35 },
+        {   4,  64 }, {   1,  61 }, {  11,  35 }, {  18,  25 },
+        {  12,  24 }, {  13,  29 }, {  13,  36 }, { -10,  93 },
+        {  -7,  73 }, {  -2,  73 }, {  13,  46 }, {   9,  49 },
+        {  -7, 100 }, {   9,  53 }, {   2,  53 }, {   5,  53 },
+        {  -2,  61 }, {   0,  56 }, {   0,  56 }, { -13,  63 },
+        {  -5,  60 }, {  -1,  62 }, {   4,  57 }, {  -6,  69 },
+        {   4,  57 }, {  14,  39 }, {   4,  51 }, {  13,  68 },
+        {   3,  64 }, {   1,  61 }, {   9,  63 }, {   7,  50 },
+        {  16,  39 }, {   5,  44 }, {   4,  52 }, {  11,  48 },
+        {  -5,  60 }, {  -1,  59 }, {   0,  59 }, {  22,  33 },
+        {   5,  44 }, {  14,  43 }, {  -1,  78 }, {   0,  60 },
+        {   9,  69 },
+
+        /* 166 - 226 */
+        {  11,  28 }, {   2,  40 }, {   3,  44 }, {   0,  49 },
+        {   0,  46 }, {   2,  44 }, {   2,  51 }, {   0,  47 },
+        {   4,  39 }, {   2,  62 }, {   6,  46 }, {   0,  54 },
+        {   3,  54 }, {   2,  58 }, {   4,  63 }, {   6,  51 },
+        {   6,  57 }, {   7,  53 }, {   6,  52 }, {   6,  55 },
+        {  11,  45 }, {  14,  36 }, {   8,  53 }, {  -1,  82 },
+        {   7,  55 }, {  -3,  78 }, {  15,  46 }, {  22,  31 },
+        {  -1,  84 }, {  25,   7 }, {  30,  -7 }, {  28,   3 },
+        {  28,   4 }, {  32,   0 }, {  34,  -1 }, {  30,   6 },
+        {  30,   6 }, {  32,   9 }, {  31,  19 }, {  26,  27 },
+        {  26,  30 }, {  37,  20 }, {  28,  34 }, {  17,  70 },
+        {   1,  67 }, {   5,  59 }, {   9,  67 }, {  16,  30 },
+        {  18,  32 }, {  18,  35 }, {  22,  29 }, {  24,  31 },
+        {  23,  38 }, {  18,  43 }, {  20,  41 }, {  11,  63 },
+        {   9,  59 }, {   9,  64 }, {  -1,  94 }, {  -2,  89 },
+        {  -9, 108 },
+
+        /* 227 - 275 */
+        {  -6,  76 }, {  -2,  44 }, {   0,  45 }, {   0,  52 },
+        {  -3,  64 }, {  -2,  59 }, {  -4,  70 }, {  -4,  75 },
+        {  -8,  82 }, { -17, 102 }, {  -9,  77 }, {   3,  24 },
+        {   0,  42 }, {   0,  48 }, {   0,  55 }, {  -6,  59 },
+        {  -7,  71 }, { -12,  83 }, { -11,  87 }, { -30, 119 },
+        {   1,  58 }, {  -3,  29 }, {  -1,  36 }, {   1,  38 },
+        {   2,  43 }, {  -6,  55 }, {   0,  58 }, {   0,  64 },
+        {  -3,  74 }, { -10,  90 }, {   0,  70 }, {  -4,  29 },
+        {   5,  31 }, {   7,  42 }, {   1,  59 }, {  -2,  58 },
+        {  -3,  72 }, {  -3,  81 }, { -11,  97 }, {   0,  58 },
+        {   8,   5 }, {  10,  14 }, {  14,  18 }, {  13,  27 },
+        {   2,  40 }, {   0,  58 }, {  -3,  70 }, {  -6,  79 },
+        {  -8,  85 },
+
+        /* 276 a bit special (not used, x264_cabac_encode_bypass is used instead) */
+        { 0, 0 },
+
+        /* 277 - 337 */
+        { -13, 106 }, { -16, 106 }, { -10,  87 }, { -21, 114 },
+        { -18, 110 }, { -14,  98 }, { -22, 110 }, { -21, 106 },
+        { -18, 103 }, { -21, 107 }, { -23, 108 }, { -26, 112 },
+        { -10,  96 }, { -12,  95 }, {  -5,  91 }, {  -9,  93 },
+        { -22,  94 }, {  -5,  86 }, {   9,  67 }, {  -4,  80 },
+        { -10,  85 }, {  -1,  70 }, {   7,  60 }, {   9,  58 },
+        {   5,  61 }, {  12,  50 }, {  15,  50 }, {  18,  49 },
+        {  17,  54 }, {  10,  41 }, {   7,  46 }, {  -1,  51 },
+        {   7,  49 }, {   8,  52 }, {   9,  41 }, {   6,  47 },
+        {   2,  55 }, {  13,  41 }, {  10,  44 }, {   6,  50 },
+        {   5,  53 }, {  13,  49 }, {   4,  63 }, {   6,  64 },
+        {  -2,  69 }, {  -2,  59 }, {   6,  70 }, {  10,  44 },
+        {   9,  31 }, {  12,  43 }, {   3,  53 }, {  14,  34 },
+        {  10,  38 }, {  -3,  52 }, {  13,  40 }, {  17,  32 },
+        {   7,  44 }, {   7,  38 }, {  13,  50 }, {  10,  57 },
+        {  26,  43 },
+
+        /* 338 - 398 */
+        {  14,  11 }, {  11,  14 }, {   9,  11 }, {  18,  11 },
+        {  21,   9 }, {  23,  -2 }, {  32, -15 }, {  32, -15 },
+        {  34, -21 }, {  39, -23 }, {  42, -33 }, {  41, -31 },
+        {  46, -28 }, {  38, -12 }, {  21,  29 }, {  45, -24 },
+        {  53, -45 }, {  48, -26 }, {  65, -43 }, {  43, -19 },
+        {  39, -10 }, {  30,   9 }, {  18,  26 }, {  20,  27 },
+        {   0,  57 }, { -14,  82 }, {  -5,  75 }, { -19,  97 },
+        { -35, 125 }, {  27,   0 }, {  28,   0 }, {  31,  -4 },
+        {  27,   6 }, {  34,   8 }, {  30,  10 }, {  24,  22 },
+        {  33,  19 }, {  22,  32 }, {  26,  31 }, {  21,  41 },
+        {  26,  44 }, {  23,  47 }, {  16,  65 }, {  14,  71 },
+        {   8,  60 }, {   6,  63 }, {  17,  65 }, {  21,  24 },
+        {  23,  20 }, {  26,  23 }, {  27,  32 }, {  28,  23 },
+        {  28,  24 }, {  23,  40 }, {  24,  32 }, {  28,  29 },
+        {  23,  42 }, {  19,  57 }, {  22,  53 }, {  22,  61 },
+        {  11,  86 },
+
+
+    },
+
+    /* i_cabac_init_idc == 1 */
+    {
+        /* 0 - 10 */
+        {  20, -15 }, {   2,  54 }, {   3,  74 }, {  20, -15 },
+        {   2,  54 }, {   3,  74 }, { -28, 127 }, { -23, 104 },
+        {  -6,  53 }, {  -1,  54 }, {   7,  51 },
+
+        /* 11 - 23 */
+        {  22,  25 }, {  34,   0 }, {  16,   0 }, {  -2,   9 },
+        {   4,  41 }, { -29, 118 }, {   2,  65 }, {  -6,  71 },
+        { -13,  79 }, {   5,  52 }, {   9,  50 }, {  -3,  70 },
+        {  10,  54 },
+
+        /* 24 - 39 */
+        {  26,  34 }, {  19,  22 }, {  40,   0 }, {  57,   2 },
+        {  41,  36 }, {  26,  69 }, { -45, 127 }, { -15, 101 },
+        {  -4,  76 }, {  -6,  71 }, { -13,  79 }, {   5,  52 },
+        {   6,  69 }, { -13,  90 }, {   0,  52 }, {   8,  43 },
+
+        /* 40 - 53 */
+        {  -2,  69 },{  -5,  82 },{ -10,  96 },{   2,  59 },
+        {   2,  75 },{  -3,  87 },{  -3,  100 },{   1,  56 },
+        {  -3,  74 },{  -6,  85 },{   0,  59 },{  -3,  81 },
+        {  -7,  86 },{  -5,  95 },
+
+        /* 54 - 59 */
+        {  -1,  66 },{  -1,  77 },{   1,  70 },{  -2,  86 },
+        {  -5,  72 },{   0,  61 },
+
+        /* 60 - 69 */
+        { 0, 41 },   { 0, 63 },   { 0, 63 },     { 0, 63 },
+        { -9, 83 },  { 4, 86 },   { 0, 97 },     { -7, 72 },
+        { 13, 41 },  { 3, 62 },
+
+        /* 70 - 104 */
+        {  13,  15 }, {   7,  51 }, {   2,  80 }, { -39, 127 },
+        { -18,  91 }, { -17,  96 }, { -26,  81 }, { -35,  98 },
+        { -24, 102 }, { -23,  97 }, { -27, 119 }, { -24,  99 },
+        { -21, 110 }, { -18, 102 }, { -36, 127 }, {   0,  80 },
+        {  -5,  89 }, {  -7,  94 }, {  -4,  92 }, {   0,  39 },
+        {   0,  65 }, { -15,  84 }, { -35, 127 }, {  -2,  73 },
+        { -12, 104 }, {  -9,  91 }, { -31, 127 }, {   3,  55 },
+        {   7,  56 }, {   7,  55 }, {   8,  61 }, {  -3,  53 },
+        {   0,  68 }, {  -7,  74 }, {  -9,  88 },
+
+        /* 105 -> 165 */
+        { -13, 103 }, { -13,  91 }, {  -9,  89 }, { -14,  92 },
+        {  -8,  76 }, { -12,  87 }, { -23, 110 }, { -24, 105 },
+        { -10,  78 }, { -20, 112 }, { -17,  99 }, { -78, 127 },
+        { -70, 127 }, { -50, 127 }, { -46, 127 }, {  -4,  66 },
+        {  -5,  78 }, {  -4,  71 }, {  -8,  72 }, {   2,  59 },
+        {  -1,  55 }, {  -7,  70 }, {  -6,  75 }, {  -8,  89 },
+        { -34, 119 }, {  -3,  75 }, {  32,  20 }, {  30,  22 },
+        { -44, 127 }, {   0,  54 }, {  -5,  61 }, {   0,  58 },
+        {  -1,  60 }, {  -3,  61 }, {  -8,  67 }, { -25,  84 },
+        { -14,  74 }, {  -5,  65 }, {   5,  52 }, {   2,  57 },
+        {   0,  61 }, {  -9,  69 }, { -11,  70 }, {  18,  55 },
+        {  -4,  71 }, {   0,  58 }, {   7,  61 }, {   9,  41 },
+        {  18,  25 }, {   9,  32 }, {   5,  43 }, {   9,  47 },
+        {   0,  44 }, {   0,  51 }, {   2,  46 }, {  19,  38 },
+        {  -4,  66 }, {  15,  38 }, {  12,  42 }, {   9,  34 },
+        {   0,  89 },
+
+        /* 166 - 226 */
+        {   4,  45 }, {  10,  28 }, {  10,  31 }, {  33, -11 },
+        {  52, -43 }, {  18,  15 }, {  28,   0 }, {  35, -22 },
+        {  38, -25 }, {  34,   0 }, {  39, -18 }, {  32, -12 },
+        { 102, -94 }, {   0,   0 }, {  56, -15 }, {  33,  -4 },
+        {  29,  10 }, {  37,  -5 }, {  51, -29 }, {  39,  -9 },
+        {  52, -34 }, {  69, -58 }, {  67, -63 }, {  44,  -5 },
+        {  32,   7 }, {  55, -29 }, {  32,   1 }, {   0,   0 },
+        {  27,  36 }, {  33, -25 }, {  34, -30 }, {  36, -28 },
+        {  38, -28 }, {  38, -27 }, {  34, -18 }, {  35, -16 },
+        {  34, -14 }, {  32,  -8 }, {  37,  -6 }, {  35,   0 },
+        {  30,  10 }, {  28,  18 }, {  26,  25 }, {  29,  41 },
+        {   0,  75 }, {   2,  72 }, {   8,  77 }, {  14,  35 },
+        {  18,  31 }, {  17,  35 }, {  21,  30 }, {  17,  45 },
+        {  20,  42 }, {  18,  45 }, {  27,  26 }, {  16,  54 },
+        {   7,  66 }, {  16,  56 }, {  11,  73 }, {  10,  67 },
+        { -10, 116 },
+
+        /* 227 - 275 */
+        { -23, 112 }, { -15,  71 }, {  -7,  61 }, {   0,  53 },
+        {  -5,  66 }, { -11,  77 }, {  -9,  80 }, {  -9,  84 },
+        { -10,  87 }, { -34, 127 }, { -21, 101 }, {  -3,  39 },
+        {  -5,  53 }, {  -7,  61 }, { -11,  75 }, { -15,  77 },
+        { -17,  91 }, { -25, 107 }, { -25, 111 }, { -28, 122 },
+        { -11,  76 }, { -10,  44 }, { -10,  52 }, { -10,  57 },
+        {  -9,  58 }, { -16,  72 }, {  -7,  69 }, {  -4,  69 },
+        {  -5,  74 }, {  -9,  86 }, {   2,  66 }, {  -9,  34 },
+        {   1,  32 }, {  11,  31 }, {   5,  52 }, {  -2,  55 },
+        {  -2,  67 }, {   0,  73 }, {  -8,  89 }, {   3,  52 },
+        {   7,   4 }, {  10,   8 }, {  17,   8 }, {  16,  19 },
+        {   3,  37 }, {  -1,  61 }, {  -5,  73 }, {  -1,  70 },
+        {  -4,  78 },
+
+        /* 276 a bit special (not used, x264_cabac_encode_bypass is used instead) */
+        { 0, 0 },
+
+        /* 277 - 337 */
+        { -21, 126 }, { -23, 124 }, { -20, 110 }, { -26, 126 },
+        { -25, 124 }, { -17, 105 }, { -27, 121 }, { -27, 117 },
+        { -17, 102 }, { -26, 117 }, { -27, 116 }, { -33, 122 },
+        { -10,  95 }, { -14, 100 }, {  -8,  95 }, { -17, 111 },
+        { -28, 114 }, {  -6,  89 }, {  -2,  80 }, {  -4,  82 },
+        {  -9,  85 }, {  -8,  81 }, {  -1,  72 }, {   5,  64 },
+        {   1,  67 }, {   9,  56 }, {   0,  69 }, {   1,  69 },
+        {   7,  69 }, {  -7,  69 }, {  -6,  67 }, { -16,  77 },
+        {  -2,  64 }, {   2,  61 }, {  -6,  67 }, {  -3,  64 },
+        {   2,  57 }, {  -3,  65 }, {  -3,  66 }, {   0,  62 },
+        {   9,  51 }, {  -1,  66 }, {  -2,  71 }, {  -2,  75 },
+        {  -1,  70 }, {  -9,  72 }, {  14,  60 }, {  16,  37 },
+        {   0,  47 }, {  18,  35 }, {  11,  37 }, {  12,  41 },
+        {  10,  41 }, {   2,  48 }, {  12,  41 }, {  13,  41 },
+        {   0,  59 }, {   3,  50 }, {  19,  40 }, {   3,  66 },
+        {  18,  50 },
+
+        /* 338 - 398 */
+        {  19,  -6 }, {  18,  -6 }, {  14,   0 }, {  26, -12 },
+        {  31, -16 }, {  33, -25 }, {  33, -22 }, {  37, -28 },
+        {  39, -30 }, {  42, -30 }, {  47, -42 }, {  45, -36 },
+        {  49, -34 }, {  41, -17 }, {  32,   9 }, {  69, -71 },
+        {  63, -63 }, {  66, -64 }, {  77, -74 }, {  54, -39 },
+        {  52, -35 }, {  41, -10 }, {  36,   0 }, {  40,  -1 },
+        {  30,  14 }, {  28,  26 }, {  23,  37 }, {  12,  55 },
+        {  11,  65 }, {  37, -33 }, {  39, -36 }, {  40, -37 },
+        {  38, -30 }, {  46, -33 }, {  42, -30 }, {  40, -24 },
+        {  49, -29 }, {  38, -12 }, {  40, -10 }, {  38,  -3 },
+        {  46,  -5 }, {  31,  20 }, {  29,  30 }, {  25,  44 },
+        {  12,  48 }, {  11,  49 }, {  26,  45 }, {  22,  22 },
+        {  23,  22 }, {  27,  21 }, {  33,  20 }, {  26,  28 },
+        {  30,  24 }, {  27,  34 }, {  18,  42 }, {  25,  39 },
+        {  18,  50 }, {  12,  70 }, {  21,  54 }, {  14,  71 },
+        {  11,  83 },
+
+    },
+
+    /* i_cabac_init_idc == 2 */
+    {
+        /* 0 - 10 */
+        {  20, -15 }, {   2,  54 }, {   3,  74 }, {  20, -15 },
+        {   2,  54 }, {   3,  74 }, { -28, 127 }, { -23, 104 },
+        {  -6,  53 }, {  -1,  54 }, {   7,  51 },
+
+        /* 11 - 23 */
+        {  29,  16 }, {  25,   0 }, {  14,   0 }, { -10,  51 },
+        {  -3,  62 }, { -27,  99 }, {  26,  16 }, {  -4,  85 },
+        { -24, 102 }, {   5,  57 }, {   6,  57 }, { -17,  73 },
+        {  14,  57 },
+
+        /* 24 - 39 */
+        {  20,  40 }, {  20,  10 }, {  29,   0 }, {  54,   0 },
+        {  37,  42 }, {  12,  97 }, { -32, 127 }, { -22, 117 },
+        {  -2,  74 }, {  -4,  85 }, { -24, 102 }, {   5,  57 },
+        {  -6,  93 }, { -14,  88 }, {  -6,  44 }, {   4,  55 },
+
+        /* 40 - 53 */
+        { -11,  89 },{ -15,  103 },{ -21,  116 },{  19,  57 },
+        {  20,  58 },{   4,  84 },{   6,  96 },{   1,  63 },
+        {  -5,  85 },{ -13,  106 },{   5,  63 },{   6,  75 },
+        {  -3,  90 },{  -1,  101 },
+
+        /* 54 - 59 */
+        {   3,  55 },{  -4,  79 },{  -2,  75 },{ -12,  97 },
+        {  -7,  50 },{   1,  60 },
+
+        /* 60 - 69 */
+        { 0, 41 },   { 0, 63 },   { 0, 63 },     { 0, 63 },
+        { -9, 83 },  { 4, 86 },   { 0, 97 },     { -7, 72 },
+        { 13, 41 },  { 3, 62 },
+
+        /* 70 - 104 */
+        {   7,  34 }, {  -9,  88 }, { -20, 127 }, { -36, 127 },
+        { -17,  91 }, { -14,  95 }, { -25,  84 }, { -25,  86 },
+        { -12,  89 }, { -17,  91 }, { -31, 127 }, { -14,  76 },
+        { -18, 103 }, { -13,  90 }, { -37, 127 }, {  11,  80 },
+        {   5,  76 }, {   2,  84 }, {   5,  78 }, {  -6,  55 },
+        {   4,  61 }, { -14,  83 }, { -37, 127 }, {  -5,  79 },
+        { -11, 104 }, { -11,  91 }, { -30, 127 }, {   0,  65 },
+        {  -2,  79 }, {   0,  72 }, {  -4,  92 }, {  -6,  56 },
+        {   3,  68 }, {  -8,  71 }, { -13,  98 },
+
+        /* 105 -> 165 */
+        {  -4,  86 }, { -12,  88 }, {  -5,  82 }, {  -3,  72 },
+        {  -4,  67 }, {  -8,  72 }, { -16,  89 }, {  -9,  69 },
+        {  -1,  59 }, {   5,  66 }, {   4,  57 }, {  -4,  71 },
+        {  -2,  71 }, {   2,  58 }, {  -1,  74 }, {  -4,  44 },
+        {  -1,  69 }, {   0,  62 }, {  -7,  51 }, {  -4,  47 },
+        {  -6,  42 }, {  -3,  41 }, {  -6,  53 }, {   8,  76 },
+        {  -9,  78 }, { -11,  83 }, {   9,  52 }, {   0,  67 },
+        {  -5,  90 }, {   1,  67 }, { -15,  72 }, {  -5,  75 },
+        {  -8,  80 }, { -21,  83 }, { -21,  64 }, { -13,  31 },
+        { -25,  64 }, { -29,  94 }, {   9,  75 }, {  17,  63 },
+        {  -8,  74 }, {  -5,  35 }, {  -2,  27 }, {  13,  91 },
+        {   3,  65 }, {  -7,  69 }, {   8,  77 }, { -10,  66 },
+        {   3,  62 }, {  -3,  68 }, { -20,  81 }, {   0,  30 },
+        {   1,   7 }, {  -3,  23 }, { -21,  74 }, {  16,  66 },
+        { -23, 124 }, {  17,  37 }, {  44, -18 }, {  50, -34 },
+        { -22, 127 },
+
+        /* 166 - 226 */
+        {   4,  39 }, {   0,  42 }, {   7,  34 }, {  11,  29 },
+        {   8,  31 }, {   6,  37 }, {   7,  42 }, {   3,  40 },
+        {   8,  33 }, {  13,  43 }, {  13,  36 }, {   4,  47 },
+        {   3,  55 }, {   2,  58 }, {   6,  60 }, {   8,  44 },
+        {  11,  44 }, {  14,  42 }, {   7,  48 }, {   4,  56 },
+        {   4,  52 }, {  13,  37 }, {   9,  49 }, {  19,  58 },
+        {  10,  48 }, {  12,  45 }, {   0,  69 }, {  20,  33 },
+        {   8,  63 }, {  35, -18 }, {  33, -25 }, {  28,  -3 },
+        {  24,  10 }, {  27,   0 }, {  34, -14 }, {  52, -44 },
+        {  39, -24 }, {  19,  17 }, {  31,  25 }, {  36,  29 },
+        {  24,  33 }, {  34,  15 }, {  30,  20 }, {  22,  73 },
+        {  20,  34 }, {  19,  31 }, {  27,  44 }, {  19,  16 },
+        {  15,  36 }, {  15,  36 }, {  21,  28 }, {  25,  21 },
+        {  30,  20 }, {  31,  12 }, {  27,  16 }, {  24,  42 },
+        {   0,  93 }, {  14,  56 }, {  15,  57 }, {  26,  38 },
+        { -24, 127 },
+
+        /* 227 - 275 */
+        { -24, 115 }, { -22,  82 }, {  -9,  62 }, {   0,  53 },
+        {   0,  59 }, { -14,  85 }, { -13,  89 }, { -13,  94 },
+        { -11,  92 }, { -29, 127 }, { -21, 100 }, { -14,  57 },
+        { -12,  67 }, { -11,  71 }, { -10,  77 }, { -21,  85 },
+        { -16,  88 }, { -23, 104 }, { -15,  98 }, { -37, 127 },
+        { -10,  82 }, {  -8,  48 }, {  -8,  61 }, {  -8,  66 },
+        {  -7,  70 }, { -14,  75 }, { -10,  79 }, {  -9,  83 },
+        { -12,  92 }, { -18, 108 }, {  -4,  79 }, { -22,  69 },
+        { -16,  75 }, {  -2,  58 }, {   1,  58 }, { -13,  78 },
+        {  -9,  83 }, {  -4,  81 }, { -13,  99 }, { -13,  81 },
+        {  -6,  38 }, { -13,  62 }, {  -6,  58 }, {  -2,  59 },
+        { -16,  73 }, { -10,  76 }, { -13,  86 }, {  -9,  83 },
+        { -10,  87 },
+
+        /* 276 a bit special (not used, x264_cabac_encode_bypass is used instead) */
+        { 0, 0 },
+
+        /* 277 - 337 */
+        { -22, 127 }, { -25, 127 }, { -25, 120 }, { -27, 127 },
+        { -19, 114 }, { -23, 117 }, { -25, 118 }, { -26, 117 },
+        { -24, 113 }, { -28, 118 }, { -31, 120 }, { -37, 124 },
+        { -10,  94 }, { -15, 102 }, { -10,  99 }, { -13, 106 },
+        { -50, 127 }, {  -5,  92 }, {  17,  57 }, {  -5,  86 },
+        { -13,  94 }, { -12,  91 }, {  -2,  77 }, {   0,  71 },
+        {  -1,  73 }, {   4,  64 }, {  -7,  81 }, {   5,  64 },
+        {  15,  57 }, {   1,  67 }, {   0,  68 }, { -10,  67 },
+        {   1,  68 }, {   0,  77 }, {   2,  64 }, {   0,  68 },
+        {  -5,  78 }, {   7,  55 }, {   5,  59 }, {   2,  65 },
+        {  14,  54 }, {  15,  44 }, {   5,  60 }, {   2,  70 },
+        {  -2,  76 }, { -18,  86 }, {  12,  70 }, {   5,  64 },
+        { -12,  70 }, {  11,  55 }, {   5,  56 }, {   0,  69 },
+        {   2,  65 }, {  -6,  74 }, {   5,  54 }, {   7,  54 },
+        {  -6,  76 }, { -11,  82 }, {  -2,  77 }, {  -2,  77 },
+        {  25,  42 },
+
+        /* 338 - 398 */
+        {  17, -13 }, {  16,  -9 }, {  17, -12 }, {  27, -21 },
+        {  37, -30 }, {  41, -40 }, {  42, -41 }, {  48, -47 },
+        {  39, -32 }, {  46, -40 }, {  52, -51 }, {  46, -41 },
+        {  52, -39 }, {  43, -19 }, {  32,  11 }, {  61, -55 },
+        {  56, -46 }, {  62, -50 }, {  81, -67 }, {  45, -20 },
+        {  35,  -2 }, {  28,  15 }, {  34,   1 }, {  39,   1 },
+        {  30,  17 }, {  20,  38 }, {  18,  45 }, {  15,  54 },
+        {   0,  79 }, {  36, -16 }, {  37, -14 }, {  37, -17 },
+        {  32,   1 }, {  34,  15 }, {  29,  15 }, {  24,  25 },
+        {  34,  22 }, {  31,  16 }, {  35,  18 }, {  31,  28 },
+        {  33,  41 }, {  36,  28 }, {  27,  47 }, {  21,  62 },
+        {  18,  31 }, {  19,  26 }, {  36,  24 }, {  24,  23 },
+        {  27,  16 }, {  24,  30 }, {  31,  29 }, {  22,  41 },
+        {  22,  42 }, {  16,  60 }, {  15,  52 }, {  14,  60 },
+        {   3,  78 }, { -16, 123 }, {  21,  53 }, {  22,  56 },
+        {  25,  61 },
+    }
+};
+
+static const int x264_cabac_range_lps[64][4] = /* LPS sub-range: row = context i_state, column = (i_range>>6)&3 */
+{
+    { 128, 176, 208, 240 }, { 128, 167, 197, 227 }, { 128, 158, 187, 216 },
+    { 123, 150, 178, 205 }, { 116, 142, 169, 195 }, { 111, 135, 160, 185 },
+    { 105, 128, 152, 175 }, { 100, 122, 144, 166 }, {  95, 116, 137, 158 },
+    {  90, 110, 130, 150 }, {  85, 104, 123, 142 }, {  81,  99, 117, 135 },
+    {  77,  94, 111, 128 }, {  73,  89, 105, 122 }, {  69,  85, 100, 116 },
+    {  66,  80,  95, 110 }, {  62,  76,  90, 104 }, {  59,  72,  86,  99 },
+    {  56,  69,  81,  94 }, {  53,  65,  77,  89 }, {  51,  62,  73,  85 },
+    {  48,  59,  69,  80 }, {  46,  56,  66,  76 }, {  43,  53,  63,  72 },
+    {  41,  50,  59,  69 }, {  39,  48,  56,  65 }, {  37,  45,  54,  62 },
+    {  35,  43,  51,  59 }, {  33,  41,  48,  56 }, {  32,  39,  46,  53 },
+    {  30,  37,  43,  50 }, {  29,  35,  41,  48 }, {  27,  33,  39,  45 },
+    {  26,  31,  37,  43 }, {  24,  30,  35,  41 }, {  23,  28,  33,  39 },
+    {  22,  27,  32,  37 }, {  21,  26,  30,  35 }, {  20,  24,  29,  33 },
+    {  19,  23,  27,  31 }, {  18,  22,  26,  30 }, {  17,  21,  25,  28 },
+    {  16,  20,  23,  27 }, {  15,  19,  22,  25 }, {  14,  18,  21,  24 },
+    {  14,  17,  20,  23 }, {  13,  16,  19,  22 }, {  12,  15,  18,  21 },
+    {  12,  14,  17,  20 }, {  11,  14,  16,  19 }, {  11,  13,  15,  18 },
+    {  10,  12,  15,  17 }, {  10,  12,  14,  16 }, {   9,  11,  13,  15 },
+    {   9,  11,  12,  14 }, {   8,  10,  12,  14 }, {   8,   9,  11,  13 },
+    {   7,   9,  11,  12 }, {   7,   9,  10,  12 }, {   7,   8,  10,  11 },
+    {   6,   8,   9,  11 }, {   6,   7,   9,  10 }, {   6,   7,   8,   9 },
+    {   2,   2,   2,   2 },
+};
+
+static const int x264_transition_lps[64] = /* next i_state after an LPS was coded, indexed by the current i_state */
+{
+     0, 0, 1, 2, 2, 4, 4, 5, 6, 7, 8, 9, 9,11,11,12,
+    13,13,15,15,16,16,18,18,19,19,21,21,22,22,23,24,
+    24,25,26,26,27,27,28,29,29,30,30,30,31,32,32,33,
+    33,33,34,34,35,35,35,36,36,36,37,37,37,38,38,63
+};
+static const int x264_transition_mps[64] = /* next i_state after an MPS was coded, indexed by the current i_state */
+{
+     1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,16,
+    17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,
+    33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,
+    49,50,51,52,53,54,55,56,57,58,59,60,61,62,62,63,
+};
+
+#define FIX8(f) ((int)(f*(1<<8))) /* scale a floating-point constant by 256 and truncate it to int */
+static int x264_cabac_probability[128] = /* read by x264_cabac_model_update, indexed by its i_ctx_state and by 127 - i_ctx_state */
+{
+    FIX8(0.9812), FIX8(0.9802), FIX8(0.9792), FIX8(0.9781),
+    FIX8(0.9769), FIX8(0.9757), FIX8(0.9744), FIX8(0.9730),
+    FIX8(0.9716), FIX8(0.9700), FIX8(0.9684), FIX8(0.9667),
+    FIX8(0.9650), FIX8(0.9631), FIX8(0.9611), FIX8(0.9590),
+    FIX8(0.9568), FIX8(0.9545), FIX8(0.9521), FIX8(0.9495),
+    FIX8(0.9468), FIX8(0.9440), FIX8(0.9410), FIX8(0.9378),
+    FIX8(0.9345), FIX8(0.9310), FIX8(0.9273), FIX8(0.9234),
+    FIX8(0.9193), FIX8(0.9150), FIX8(0.9105), FIX8(0.9057),
+    FIX8(0.9006), FIX8(0.8953), FIX8(0.8897), FIX8(0.8838),
+    FIX8(0.8776), FIX8(0.8710), FIX8(0.8641), FIX8(0.8569),
+    FIX8(0.8492), FIX8(0.8411), FIX8(0.8326), FIX8(0.8237),
+    FIX8(0.8143), FIX8(0.8043), FIX8(0.7938), FIX8(0.7828),
+    FIX8(0.7712), FIX8(0.7590), FIX8(0.7461), FIX8(0.7325),
+    FIX8(0.7182), FIX8(0.7031), FIX8(0.6872), FIX8(0.6705),
+    FIX8(0.6528), FIX8(0.6343), FIX8(0.6147), FIX8(0.5941),
+    FIX8(0.5724), FIX8(0.5495), FIX8(0.5254), FIX8(0.5000),
+    FIX8(0.5000), FIX8(0.4746), FIX8(0.4505), FIX8(0.4276),
+    FIX8(0.4059), FIX8(0.3853), FIX8(0.3657), FIX8(0.3472),
+    FIX8(0.3295), FIX8(0.3128), FIX8(0.2969), FIX8(0.2818),
+    FIX8(0.2675), FIX8(0.2539), FIX8(0.2410), FIX8(0.2288),
+    FIX8(0.2172), FIX8(0.2062), FIX8(0.1957), FIX8(0.1857),
+    FIX8(0.1763), FIX8(0.1674), FIX8(0.1589), FIX8(0.1508),
+    FIX8(0.1431), FIX8(0.1359), FIX8(0.1290), FIX8(0.1224),
+    FIX8(0.1162), FIX8(0.1103), FIX8(0.1047), FIX8(0.0994),
+    FIX8(0.0943), FIX8(0.0895), FIX8(0.0850), FIX8(0.0807),
+    FIX8(0.0766), FIX8(0.0727), FIX8(0.0690), FIX8(0.0655),
+    FIX8(0.0622), FIX8(0.0590), FIX8(0.0560), FIX8(0.0532),
+    FIX8(0.0505), FIX8(0.0479), FIX8(0.0455), FIX8(0.0432),
+    FIX8(0.0410), FIX8(0.0389), FIX8(0.0369), FIX8(0.0350),
+    FIX8(0.0333), FIX8(0.0316), FIX8(0.0300), FIX8(0.0284),
+    FIX8(0.0270), FIX8(0.0256), FIX8(0.0243), FIX8(0.0231),
+    FIX8(0.0219), FIX8(0.0208), FIX8(0.0198), FIX8(0.0187)
+};
+/* -log2(probability): entry i is -log2 of x264_cabac_probability[i], stored through FIX8 */
+static int x264_cabac_entropy[128] =
+{
+    FIX8(0.0273), FIX8(0.0288), FIX8(0.0303), FIX8(0.0320),
+    FIX8(0.0337), FIX8(0.0355), FIX8(0.0375), FIX8(0.0395),
+    FIX8(0.0416), FIX8(0.0439), FIX8(0.0463), FIX8(0.0488),
+    FIX8(0.0515), FIX8(0.0543), FIX8(0.0572), FIX8(0.0604),
+    FIX8(0.0637), FIX8(0.0671), FIX8(0.0708), FIX8(0.0747),
+    FIX8(0.0788), FIX8(0.0832), FIX8(0.0878), FIX8(0.0926),
+    FIX8(0.0977), FIX8(0.1032), FIX8(0.1089), FIX8(0.1149),
+    FIX8(0.1214), FIX8(0.1282), FIX8(0.1353), FIX8(0.1429),
+    FIX8(0.1510), FIX8(0.1596), FIX8(0.1686), FIX8(0.1782),
+    FIX8(0.1884), FIX8(0.1992), FIX8(0.2107), FIX8(0.2229),
+    FIX8(0.2358), FIX8(0.2496), FIX8(0.2642), FIX8(0.2798),
+    FIX8(0.2964), FIX8(0.3142), FIX8(0.3331), FIX8(0.3532),
+    FIX8(0.3748), FIX8(0.3979), FIX8(0.4226), FIX8(0.4491),
+    FIX8(0.4776), FIX8(0.5082), FIX8(0.5412), FIX8(0.5768),
+    FIX8(0.6152), FIX8(0.6568), FIX8(0.7020), FIX8(0.7513),
+    FIX8(0.8050), FIX8(0.8638), FIX8(0.9285), FIX8(1.0000),
+    FIX8(1.0000), FIX8(1.0752), FIX8(1.1504), FIX8(1.2256),
+    FIX8(1.3008), FIX8(1.3759), FIX8(1.4511), FIX8(1.5263),
+    FIX8(1.6015), FIX8(1.6767), FIX8(1.7519), FIX8(1.8271),
+    FIX8(1.9023), FIX8(1.9775), FIX8(2.0527), FIX8(2.1278),
+    FIX8(2.2030), FIX8(2.2782), FIX8(2.3534), FIX8(2.4286),
+    FIX8(2.5038), FIX8(2.5790), FIX8(2.6542), FIX8(2.7294),
+    FIX8(2.8046), FIX8(2.8797), FIX8(2.9549), FIX8(3.0301),
+    FIX8(3.1053), FIX8(3.1805), FIX8(3.2557), FIX8(3.3309),
+    FIX8(3.4061), FIX8(3.4813), FIX8(3.5565), FIX8(3.6316),
+    FIX8(3.7068), FIX8(3.7820), FIX8(3.8572), FIX8(3.9324),
+    FIX8(4.0076), FIX8(4.0828), FIX8(4.1580), FIX8(4.2332),
+    FIX8(4.3083), FIX8(4.3836), FIX8(4.4588), FIX8(4.5339),
+    FIX8(4.6091), FIX8(4.6843), FIX8(4.7595), FIX8(4.8347),
+    FIX8(4.9099), FIX8(4.9851), FIX8(5.0602), FIX8(5.1354),
+    FIX8(5.2106), FIX8(5.2859), FIX8(5.3610), FIX8(5.4362),
+    FIX8(5.5114), FIX8(5.5866), FIX8(5.6618), FIX8(5.7370)
+};
+
+#undef FIX8
+
+
+/*****************************************************************************
+ * x264_cabac_context_init: set all 399 context states from the init tables
+ *****************************************************************************/
+void x264_cabac_context_init( x264_cabac_t *cb, int i_slice_type, int i_qp, int i_model )
+{
+    const int (*cabac_context_init)[399][2];
+    int i;
+    /* I slices use their own table; every other slice type uses PB table number i_model */
+    if( i_slice_type == SLICE_TYPE_I )
+    {
+        cabac_context_init = &x264_cabac_context_init_I;
+    }
+    else
+    {
+        cabac_context_init = &x264_cabac_context_init_PB[i_model]; /* i_model is used as an index without a range check */
+    }
+    /* for each table entry { m, n }: pre-state = x264_clip3( ((m * i_qp) >> 4) + n, 1, 126 ) */
+    for( i = 0; i < 399; i++ )
+    {
+        int i_pre_state;
+
+        i_pre_state = x264_clip3( (((*cabac_context_init)[i][0] * i_qp) >> 4) + (*cabac_context_init)[i][1], 1, 126 );
+        if( i_pre_state <= 63 ) /* 1..63 maps to mps 0 with i_state 62..0 */
+        {
+            cb->ctxstate[i].i_state = 63 - i_pre_state;
+            cb->ctxstate[i].i_mps = 0;
+        }
+        else /* 64..126 maps to mps 1 with i_state 0..62 */
+        {
+            cb->ctxstate[i].i_state = i_pre_state - 64;
+            cb->ctxstate[i].i_mps = 1;
+        }
+        cb->ctxstate[i].i_count = 0; /* usage counter, incremented by x264_cabac_encode_decision */
+    }
+}
+
+/*****************************************************************************
+ * x264_cabac_decode_init: start arithmetic decoding from the bit stream s
+ *****************************************************************************/
+void x264_cabac_decode_init( x264_cabac_t *cb, bs_t *s )
+{
+    cb->i_range = 0x01fe;
+    cb->i_low   = bs_read( s, 9 ); /* the first 9 bits of the stream become the initial i_low */
+    cb->s       = s;               /* kept for the later bs_read calls of the decode functions */
+}
+
+static inline void x264_cabac_decode_renorm( x264_cabac_t *cb )
+{   /* double i_range until it is at least 0x0100, shifting one freshly read bit into i_low per step */
+    while( cb->i_range < 0x0100 )
+    {
+        cb->i_range <<= 1;
+        cb->i_low   = ( cb->i_low << 1 )|bs_read( cb->s, 1 );
+    }
+}
+
+int  x264_cabac_decode_decision( x264_cabac_t *cb, int i_ctx )
+{   /* decode one bin with context i_ctx, update that context and return the bin (0 or 1) */
+    int i_state = cb->ctxstate[i_ctx].i_state;
+    int i_mps   = cb->ctxstate[i_ctx].i_mps;
+    /* LPS sub-range for this state; the column is selected by bits 6-7 of i_range */
+    int i_range_lps = x264_cabac_range_lps[i_state][(cb->i_range>>6)&0x03];
+
+    int val;
+    /* i_range now holds the MPS sub-range */
+    cb->i_range -= i_range_lps;
+    /* i_low at or beyond the MPS sub-range means the LPS was coded */
+    if( cb->i_low >= cb->i_range )
+    {
+        val = 1 - i_mps;
+
+        cb->i_low -= cb->i_range;
+        cb->i_range= i_range_lps;
+
+        if( i_state == 0 ) /* an LPS in state 0 flips the MPS value */
+        {
+            cb->ctxstate[i_ctx].i_mps = 1 - i_mps;
+        }
+        cb->ctxstate[i_ctx].i_state = x264_transition_lps[i_state];
+    }
+    else
+    {
+        val = i_mps;
+
+        cb->ctxstate[i_ctx].i_state = x264_transition_mps[i_state];
+    }
+
+    x264_cabac_decode_renorm( cb );
+
+    return val;
+}
+int  x264_cabac_decode_bypass( x264_cabac_t *cb )
+{   /* decode one bypass bin: no context is read or updated and i_range is left unchanged */
+    cb->i_low = (cb->i_low << 1)|bs_read( cb->s, 1 );
+
+    if( cb->i_low >= cb->i_range )
+    {
+        cb->i_low -= cb->i_range;
+        return 1;
+    }
+    return 0;
+}
+int  x264_cabac_decode_terminal( x264_cabac_t *cb )
+{   /* decode the terminate bin; when it is 1 the decoder state is returned untouched */
+    if( cb->i_low >= cb->i_range - 2 )
+    {
+        return 1;
+    }
+    /* bin is 0: drop the reserved sub-range of 2 and renormalise */
+    cb->i_range -= 2;
+    x264_cabac_decode_renorm( cb );
+    return 0;
+}
+
+
+/*****************************************************************************
+ * x264_cabac_model_init: reset the model choice stored for each slice[] entry
+ *****************************************************************************/
+void x264_cabac_model_init( x264_cabac_t *cb )
+{
+    int i;
+
+    for( i = 0; i < 3; i++ )
+    {
+        cb->slice[i].i_model = 0;
+        cb->slice[i].i_cost  = -1; /* -1 is the "no cost stored" marker tested by x264_cabac_model_update */
+    }
+}
+
+int  x264_cabac_model_get ( x264_cabac_t *cb, int i_slice_type )
+{   /* return the model stored for i_slice_type; i_slice_type indexes slice[3] without a range check */
+    return cb->slice[i_slice_type].i_model;
+}
+
+void x264_cabac_model_update( x264_cabac_t *cb, int i_slice_type, int i_qp )
+{   /* store in slice[i_slice_type] the PB init table (0..2) with the lowest cost against the current context states */
+    int i;
+    /* I slices have a single init table, so there is nothing to choose */
+    if( i_slice_type == SLICE_TYPE_I )
+    {
+        return;
+    }
+    cb->slice[i_slice_type].i_cost = -1;
+    /* evaluate each of the 3 PB tables; the first one always replaces the -1 marker */
+    for( i = 0; i < 3; i++ )
+    {
+        int i_ctx;
+        int i_cost;
+
+        i_cost = 0; /* fix8 */
+        /* NOTE(review): the state is clipped to 0..127 here but to 1..126 in x264_cabac_context_init -- confirm this is intended */
+        for( i_ctx = 0; i_ctx < 399; i_ctx++ )
+        {
+            int i_weight;
+            int i_model_state;
+            int i_ctx_state;
+            /* weight is i_count * 8, capped at 1<<8, so it saturates once a context has been coded 32 times */
+            i_weight = X264_MIN( (1<<8), (cb->ctxstate[i_ctx].i_count<<8) / 32 );
+            i_model_state = x264_clip3( ((x264_cabac_context_init_PB[i][i_ctx][0] * i_qp)>>4) +
+                                          x264_cabac_context_init_PB[i][i_ctx][1], 0, 127 );
+            i_ctx_state   = cb->ctxstate[i_ctx].i_mps ? 64 + cb->ctxstate[i_ctx].i_state : 63 - cb->ctxstate[i_ctx].i_state;
+            /* probability of the current state times entropy of the table's state, summed with the mirrored (127 - x) pair */
+            i_cost += (i_weight * (( x264_cabac_probability[      i_ctx_state] * x264_cabac_entropy[      i_model_state] +
+                                     x264_cabac_probability[127 - i_ctx_state] * x264_cabac_entropy[127 - i_model_state] ) >> 8))>>8;
+        }
+        /* keep table i when no cost is stored yet or when it is strictly cheaper */
+        if( cb->slice[i_slice_type].i_cost == -1 || cb->slice[i_slice_type].i_cost > i_cost )
+        {
+            cb->slice[i_slice_type].i_model= i;
+            cb->slice[i_slice_type].i_cost = i_cost;
+        }
+    }
+}
+
+void x264_cabac_encode_init( x264_cabac_t *cb, bs_t *s )
+{   /* reset the arithmetic encoder state and attach the output bit stream s */
+    cb->i_low   = 0;
+    cb->i_range = 0x01FE;
+    cb->b_first_bit= 1; /* makes the first x264_cabac_putbit call skip writing its bit */
+    cb->i_bits_outstanding = 0;
+    cb->i_sym_cnt = 0;
+    cb->s = s;
+}
+
+static inline void x264_cabac_putbit( x264_cabac_t *cb, int b )
+{   /* write bit b (except on the very first call), then every outstanding bit as 1 - b */
+    if( cb->b_first_bit )
+    {
+        cb->b_first_bit = 0;
+    }
+    else
+    {
+        bs_write1( cb->s, b );
+    }
+    /* outstanding bits were deferred by the renorm/bypass code; they take the opposite value of b */
+    while( cb->i_bits_outstanding > 0 )
+    {
+        bs_write1( cb->s, 1 - b );
+        cb->i_bits_outstanding--;
+    }
+}
+
+static inline void x264_cabac_encode_renorm( x264_cabac_t *cb )
+{
+    /* RenormE: each pass settles one bit of i_low and doubles i_range/i_low, until i_range >= 0x0100 */
+    while( cb->i_range < 0x0100 )
+    {
+        if( cb->i_low < 0x100 )
+        {
+            x264_cabac_putbit( cb, 0 );
+        }
+        else
+        {
+            if( cb->i_low >= 0x200 )
+            {
+                cb->i_low -= 0x200;
+                x264_cabac_putbit( cb, 1 );
+            }
+            else
+            {
+                cb->i_low -= 0x100; /* 0x100 <= i_low < 0x200: bit not decided yet, defer it */
+                cb->i_bits_outstanding++;
+            }
+        }
+
+        cb->i_range <<= 1;
+        cb->i_low   <<= 1;
+    }
+}
+
+void x264_cabac_encode_decision( x264_cabac_t *cb, int i_ctx, int b )
+{   /* encode bin b with context i_ctx, then update that context's state, mps and usage count */
+    int i_state = cb->ctxstate[i_ctx].i_state;
+    int i_mps   = cb->ctxstate[i_ctx].i_mps;
+    /* LPS sub-range for this state; the column is selected by bits 6-7 of i_range */
+    int i_range_lps = x264_cabac_range_lps[i_state][(cb->i_range>>6)&0x03];
+
+#ifdef TRACE
+    if( binCount >= 0 )
+    {
+        fprintf( stderr, "%d  ctx=%d b=%d\n", binCount, i_ctx, b );
+    }
+    fprintf( stderr, "%d  0x%04x  %d  %d\n", binCount++, cb->i_range, i_state, i_mps );
+#endif
+    /* i_range now holds the MPS sub-range */
+    cb->i_range -= i_range_lps;
+    /* LPS: move i_low past the MPS sub-range and continue with the LPS sub-range */
+    if( b != i_mps )
+    {
+        cb->i_low += cb->i_range;
+        cb->i_range = i_range_lps;
+
+        if( i_state == 0 ) /* an LPS in state 0 flips the MPS value */
+        {
+            cb->ctxstate[i_ctx].i_mps = 1 - i_mps;
+        }
+        cb->ctxstate[i_ctx].i_state = x264_transition_lps[i_state];
+    }
+    else
+    {
+        cb->ctxstate[i_ctx].i_state = x264_transition_mps[i_state];
+    }
+    cb->ctxstate[i_ctx].i_count++; /* read by x264_cabac_model_update to weight this context */
+
+    x264_cabac_encode_renorm( cb );
+
+    cb->i_sym_cnt++;
+}
+
+void x264_cabac_encode_bypass( x264_cabac_t *cb, int b )
+{
+#ifdef TRACE
+    fprintf( stderr, "%d  0x%04x\n", binCount++, cb->i_range );
+#endif
+    /* bypass bin: no context is used and i_range is unchanged; i_low is doubled and i_range added when b is set */
+    cb->i_low <<= 1;
+    if( b )
+    {
+        cb->i_low += cb->i_range;
+    }
+    /* settle one output bit: 1 if i_low >= 0x400, 0 if i_low < 0x200, otherwise defer it as outstanding */
+    if( cb->i_low >= 0x400 )
+    {
+        x264_cabac_putbit( cb, 1 );
+        cb->i_low -= 0x400;
+    }
+    else
+    {
+        if( cb->i_low < 0x200 )
+        {
+            x264_cabac_putbit( cb, 0 );
+        }
+        else
+        {
+            cb->i_low -= 0x200;
+            cb->i_bits_outstanding++;
+        }
+    }
+    cb->i_sym_cnt++;
+}
+
+void x264_cabac_encode_terminal( x264_cabac_t *cb, int b )
+{
+#ifdef TRACE
+    fprintf( stderr, "%d  0x%04x\n", binCount++, cb->i_range );
+#endif
+    /* terminate bin: a sub-range of 2 is reserved at the top of the interval and selected when b is non-zero */
+    cb->i_range -= 2;
+    if( b )
+    {
+        cb->i_low += cb->i_range;
+        cb->i_range = 2;
+    }
+    x264_cabac_encode_renorm( cb );
+
+    cb->i_sym_cnt++;
+}
+
+void x264_cabac_encode_flush( x264_cabac_t *cb )
+{   /* write bit 9 of i_low through putbit (which also emits outstanding bits), then bit 8 directly */
+    x264_cabac_putbit( cb, (cb->i_low >> 9)&0x01 );
+    bs_write1( cb->s, (cb->i_low >> 8)&0x01 );
+
+    /* check that -- original author's open note; a single 1 bit is written, then bs_align_0 is called */
+    bs_write1( cb->s, 0x01 );
+    bs_align_0( cb->s );
+}
+
diff --git a/core/cabac.h b/core/cabac.h
new file mode 100644
index 00000000..3051789d
--- /dev/null
+++ b/core/cabac.h
@@ -0,0 +1,78 @@
+/*****************************************************************************
+ * cabac.h: h264 encoder library
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: cabac.h,v 1.1 2004/06/03 19:27:06 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#ifndef _CABAC_H
+#define _CABAC_H 1
+
+typedef struct
+{
+    /* model: init table chosen by x264_cabac_model_update, indexed by slice type */
+    struct
+    {
+        int i_model; /* index of the selected x264_cabac_context_init_PB table */
+        int i_cost;  /* cost stored with that choice, -1 when none is stored */
+    } slice[3];
+
+    /* context: one adaptive state per context index */
+    struct
+    {
+        int i_state; /* row index into x264_cabac_range_lps and the transition tables */
+        int i_mps;   /* current most probable bin value, 0 or 1 */
+        int i_count; /* decisions encoded with this context since x264_cabac_context_init */
+    } ctxstate[399];
+
+    /* state of the arithmetic coder */
+    int i_low;
+    int i_range;
+    /* incremented once per encoded bin (decision, bypass or terminal) */
+    int i_sym_cnt;
+
+    /* bit stream */
+    int b_first_bit;        /* set by x264_cabac_encode_init, cleared by the first x264_cabac_putbit */
+    int i_bits_outstanding; /* deferred bits that the next x264_cabac_putbit will emit */
+    bs_t *s;
+
+} x264_cabac_t;
+
+/* encoder/decoder: init the contexts given i_slice_type, the quantizer (i_qp) and the model */
+void x264_cabac_context_init( x264_cabac_t *cb, int i_slice_type, int i_qp, int i_model );
+
+/* decoder only: decode_init attaches the bit stream, the others each return one decoded bin */
+void x264_cabac_decode_init    ( x264_cabac_t *cb, bs_t *s );
+int  x264_cabac_decode_decision( x264_cabac_t *cb, int i_ctx_idx );
+int  x264_cabac_decode_bypass  ( x264_cabac_t *cb );
+int  x264_cabac_decode_terminal( x264_cabac_t *cb );
+
+/* encoder only: adaptive choice of the init model, kept per slice type */
+void x264_cabac_model_init( x264_cabac_t *cb );
+int  x264_cabac_model_get ( x264_cabac_t *cb, int i_slice_type );
+void x264_cabac_model_update( x264_cabac_t *cb, int i_slice_type, int i_qp );
+/* encoder only: encode_init attaches the bit stream, the encode_* calls code one bin each, encode_flush writes the final bits */
+void x264_cabac_encode_init ( x264_cabac_t *cb, bs_t *s );
+void x264_cabac_encode_decision( x264_cabac_t *cb, int i_ctx_idx, int b );
+void x264_cabac_encode_bypass( x264_cabac_t *cb, int b );
+void x264_cabac_encode_terminal( x264_cabac_t *cb, int b );
+void x264_cabac_encode_flush( x264_cabac_t *cb );
+
+
+#endif
diff --git a/core/clip1.h b/core/clip1.h
new file mode 100644
index 00000000..0e8adc21
--- /dev/null
+++ b/core/clip1.h
@@ -0,0 +1,71 @@
+/*****************************************************************************
+ * clip1.h: h264 encoder library
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: clip1.h,v 1.1 2004/06/03 19:27:06 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#ifndef _CLIP1_H
+#define _CLIP1_H 1
+
+/* Clip1 table
+ * XXX : only for tap filter.
+ *
+ * With tap filter (( 1, -5, 20, 20, -5, 1 ) + 16 )/ 32
+ * -> (-2*5 * 255+16)/32 <= out <= (2*1*255 + 2*20*255+16)/32
+ * -> -80 <= out <= 335
+ * So we need a table of 80+335+1 = 416 entries
+ */
+
+static const uint8_t x264_mc_clip1_table[80+1+335] =
+{
+    /* -80 -> -1 */
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,0,0,0,0,0,
+    /* 0 -> 255 */
+    0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 16, 17,
+    18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
+    36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53,
+    54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71,
+    72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
+    90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100,101,102,103,104,105,106,107,
+    108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,
+    126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,
+    144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,
+    162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,
+    180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,
+    198,199,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215,
+    216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,
+    234,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250,251,
+    252,253,254,255,
+    /* 256 -> 335 */
+    255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
+    255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
+    255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
+    255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
+    255,255,255,255,255,255,255,255,
+};
+
+static inline uint8_t x264_mc_clip1( int x )
+{   /* table lookup clamping x to 0..255; x must lie in -80..335, no bounds check is done */
+    return x264_mc_clip1_table[x+80];
+}
+
+#endif
diff --git a/core/common.c b/core/common.c
new file mode 100644
index 00000000..b44d9cd6
--- /dev/null
+++ b/core/common.c
@@ -0,0 +1,300 @@
+/*****************************************************************************
+ * common.c: h264 library
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: common.c,v 1.1 2004/06/03 19:27:06 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdint.h>
+
+#ifdef HAVE_MALLOC_H
+#include <malloc.h>
+#endif
+
+#include "common.h"
+#include "cpu.h"
+
+/****************************************************************************
+ * x264_param_default:
+ ****************************************************************************/
+void    x264_param_default( x264_param_t *param )
+{
+    /* Start from an all-zero structure: anything not set below stays 0. */
+    memset( param, 0, sizeof( *param ) );
+
+    /* Detect the CPU capabilities and print them on stderr. */
+    param->cpu = x264_cpu_detect();
+    fprintf( stderr, "x264: cpu capabilities: %s%s%s%s%s%s\n",
+             ( param->cpu & X264_CPU_MMX )     ? "MMX "     : "",
+             ( param->cpu & X264_CPU_MMXEXT )  ? "MMXEXT "  : "",
+             ( param->cpu & X264_CPU_SSE )     ? "SSE "     : "",
+             ( param->cpu & X264_CPU_SSE2 )    ? "SSE2 "    : "",
+             ( param->cpu & X264_CPU_3DNOW )   ? "3DNow! "  : "",
+             ( param->cpu & X264_CPU_ALTIVEC ) ? "Altivec " : "" );
+
+    /* Picture description: I420 input, 25 fps; the size and the sample
+     * aspect ratio stay at 0 (no value is chosen here). */
+    param->f_fps            = 25.0;
+    param->i_csp            = X264_CSP_I420;
+    param->i_width          = 0;
+    param->i_height         = 0;
+    param->vui.i_sar_width  = 0;
+    param->vui.i_sar_height = 0;
+
+    /* Frame type and reference settings */
+    param->i_bframe          = 0;
+    param->i_iframe          = 60;
+    param->i_idrframe        = 2;
+    param->i_frame_reference = 1;
+    /* Deblocking filter: enabled, both offsets at 0 */
+    param->i_deblocking_filter_beta    = 0;
+    param->i_deblocking_filter_alphac0 = 0;
+    param->b_deblocking_filter         = 1;
+    /* CABAC: disabled */
+    param->i_cabac_init_idc = -1;
+    param->b_cabac          = 0;
+    /* Bitrate and constant quantiser */
+    param->i_qp_constant = 26;
+    param->i_bitrate     = 3000;
+    /* Macroblock analysis flags for intra and inter coding */
+    param->analyse.inter = X264_ANALYSE_I4x4 | X264_ANALYSE_PSUB16x16;
+    param->analyse.intra = X264_ANALYSE_I4x4;
+}
+
+/****************************************************************************
+ * x264_picture_alloc:
+ ****************************************************************************/
+void x264_picture_alloc( x264_picture_t *pic, int i_csp, int i_width, int i_height )
+{
+    /* Set up pic for colorspace i_csp and allocate its i_width x i_height pixels.
+     * All planes live in one x264_malloc'ed block whose base is plane[0]; planar
+     * formats store plane[1] and plane[2] right after the first plane. Release
+     * the picture with x264_picture_clean.
+     * If the colorspace is unknown ("invalid CSP" is printed) or the allocation
+     * fails, i_plane is 0 and plane[0] is NULL, so x264_picture_clean is safe
+     * to call instead of freeing an uninitialized pointer. */
+    const int i_luma = i_width * i_height;  /* bytes of one full-size plane */
+    int i_size = 0, i_chroma = 0, i_cstride = 0, i_bpp = 0;
+
+    pic->i_type = X264_TYPE_AUTO;
+    pic->i_qpplus1 = 0;
+    pic->img.i_csp = i_csp;
+    pic->img.i_plane = 0;
+    pic->img.plane[0] = NULL;
+
+    switch( i_csp & X264_CSP_MASK )
+    {
+        case X264_CSP_I420:
+        case X264_CSP_YV12:
+            i_cstride = i_width / 2;
+            i_chroma  = i_luma / 4;
+            i_size    = 3 * i_luma / 2;
+            break;
+        case X264_CSP_I422:
+            i_cstride = i_width / 2;
+            i_chroma  = i_luma / 2;
+            i_size    = 2 * i_luma;
+            break;
+        case X264_CSP_I444:
+            i_cstride = i_width;
+            i_chroma  = i_luma;
+            i_size    = 3 * i_luma;
+            break;
+        case X264_CSP_YUYV:
+            i_bpp = 2;
+            break;
+        case X264_CSP_RGB:
+        case X264_CSP_BGR:
+            i_bpp = 3;
+            break;
+        case X264_CSP_BGRA:
+            i_bpp = 4;
+            break;
+        default:
+            fprintf( stderr, "invalid CSP\n" );
+            return;
+    }
+    if( i_bpp > 0 ) i_size = i_bpp * i_luma;    /* packed: i_bpp bytes per pixel, one plane */
+    pic->img.plane[0] = x264_malloc( i_size );
+    if( pic->img.plane[0] == NULL ) return;     /* i_plane is still 0 */
+    pic->img.i_plane = i_bpp > 0 ? 1 : 3;
+    pic->img.i_stride[0] = i_bpp > 0 ? i_bpp * i_width : i_width;
+    if( i_bpp == 0 )
+    {
+        pic->img.plane[1] = pic->img.plane[0] + i_luma;
+        pic->img.plane[2] = pic->img.plane[1] + i_chroma;
+        pic->img.i_stride[1] = i_cstride;
+        pic->img.i_stride[2] = i_cstride;
+    }
+}
+
+/****************************************************************************
+ * x264_picture_clean:
+ ****************************************************************************/
+void x264_picture_clean( x264_picture_t *pic )
+{
+    void *p_pixels = pic->img.plane[0]; /* base of the single pixel allocation */
+    x264_free( p_pixels );
+    /* wipe the descriptor so that no stale pointer survives */
+    memset( pic, 0, sizeof( *pic ) );
+}
+
+/****************************************************************************
+ * x264_nal_encode:
+ ****************************************************************************/
+int x264_nal_encode( void *p_data, int *pi_data, int b_annexeb, x264_nal_t *nal )
+{
+    /* Serialize nal into p_data: an optional 4-byte Annex B start code, the
+     * one-byte NAL header, then the payload with a 0x03 byte inserted after
+     * every run of two zero bytes that is followed by a byte <= 0x03.
+     * The number of bytes written is stored in *pi_data and also returned.
+     * FIXME the size of p_data is not known here, so overflow is not
+     * checked. */
+    uint8_t * const p_first = p_data;
+    uint8_t *p_out = p_first;
+    const uint8_t *p_in = nal->p_payload;
+    const uint8_t * const p_stop = p_in + nal->i_payload;
+    int i_zeros = 0;    /* zero bytes seen in a row */
+
+    if( b_annexeb )
+    {
+        /* long nal start code (we always use long ones)*/
+        p_out[0] = 0x00;
+        p_out[1] = 0x00;
+        p_out[2] = 0x00;
+        p_out[3] = 0x01;
+        p_out += 4;
+    }
+
+    /* nal header */
+    *p_out++ = ( 0x00 << 7 ) | ( nal->i_ref_idc << 5 ) | nal->i_type;
+
+    for( ; p_in < p_stop; p_in++ )
+    {
+        /* emulation prevention: break up 00 00 followed by 00, 01, 02 or 03 */
+        if( i_zeros == 2 && *p_in <= 0x03 )
+        {
+            *p_out++ = 0x03;
+            i_zeros = 0;
+        }
+        /* keep counting zero bytes, any other value ends the run */
+        i_zeros = ( *p_in == 0 ) ? i_zeros + 1 : 0;
+        *p_out++ = *p_in;
+    }
+
+    *pi_data = p_out - p_first;
+    return *pi_data;
+}
+
+/****************************************************************************
+ * x264_nal_decode:
+ ****************************************************************************/
+int x264_nal_decode( x264_nal_t *nal, void *p_data, int i_data )
+{
+    /* Decode the i_data bytes at p_data (header byte + escaped payload): set
+     * i_type / i_ref_idc and copy the payload, 00 00 03 -> 00 00, to p_payload.
+     * Returns 0, or -1 with i_payload = 0 when there is no header byte. */
+    uint8_t *src = p_data;
+    uint8_t *end = src + ( i_data > 0 ? i_data : 0 );
+    uint8_t *dst = nal->p_payload;
+    nal->i_payload = 0;
+    if( src == end ) return -1;
+    nal->i_type    = src[0]&0x1f;
+    nal->i_ref_idc = (src[0] >> 5)&0x03;
+    for( src++; src < end; )
+    {
+        /* compare distances: "end - 3" may point before the buffer, and "<" missed a final escape */
+        if( end - src >= 3 && src[0] == 0x00 && src[1] == 0x00 && src[2] == 0x03 )
+        {
+            *dst++ = 0x00;
+            *dst++ = 0x00;
+            src += 3;
+            continue;
+        }
+        *dst++ = *src++;
+    }
+    nal->i_payload = dst - nal->p_payload; /* decoded length, not an offset from p_data */
+    return 0;
+}
+
+
+
+/****************************************************************************
+ * x264_malloc:
+ ****************************************************************************/
+void *x264_malloc( int i_size )
+{   /* Returns a 16-byte aligned buffer, or NULL on failure; release it with x264_free only. */
+#ifdef HAVE_MALLOC_H
+    return memalign( 16, i_size );
+#else
+    /* emulated memalign, layout: [padding][int size][void *base][aligned payload] */
+    const size_t i_head = 15 + sizeof( void ** ) + sizeof( int );
+    uint8_t *align_buf, *buf = i_size < 0 ? NULL : malloc( i_size + i_head );
+    if( buf == NULL ) return NULL;  /* negative size or out of memory */
+    align_buf = buf + i_head;
+    align_buf -= (uintptr_t) align_buf & 15;
+    *( (void **) ( align_buf - sizeof( void ** ) ) ) = buf;
+    *( (int *) ( align_buf - sizeof( void ** ) - sizeof( int ) ) ) = i_size;
+    return align_buf;
+#endif
+}
+
+/****************************************************************************
+ * x264_free:
+ ****************************************************************************/
+void x264_free( void *p )
+{
+    /* a NULL pointer is accepted and ignored */
+    if( p == NULL )
+        return;
+#ifdef HAVE_MALLOC_H
+    free( p );
+#else
+    /* the emulated memalign keeps malloc's own pointer just below the payload */
+    free( ( (void **) p )[-1] );
+#endif
+}
+
+/****************************************************************************
+ * x264_realloc:
+ ****************************************************************************/
+void *x264_realloc( void *p, int i_size )
+{   /* Resize a x264_malloc'ed buffer (p may be NULL); returns NULL and keeps p on failure. */
+#ifdef HAVE_MALLOC_H
+    return realloc( p, i_size ); /* NOTE(review): realloc need not keep memalign's 16-byte alignment */
+#else
+    int       i_old_size = 0;
+    uint8_t * p_new;
+    if( p )
+    {   /* x264_malloc stored the size below the base pointer: step back in BYTES, then read the int */
+        i_old_size = *( (int*) ( (uint8_t*) p - sizeof( void ** ) - sizeof( int ) ) );
+    }
+    p_new = x264_malloc( i_size );
+    if( p_new == NULL ) return NULL;    /* keep the old buffer, as realloc does */
+    if( i_old_size > 0 && i_size > 0 )
+    {
+        memcpy( p_new, p, ( i_old_size < i_size ) ? i_old_size : i_size );
+    }
+    x264_free( p );
+    return p_new;
+#endif
+}
+
diff --git a/core/common.h b/core/common.h
new file mode 100644
index 00000000..e5c85c05
--- /dev/null
+++ b/core/common.h
@@ -0,0 +1,344 @@
+/*****************************************************************************
+ * common.h: h264 encoder
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: common.h,v 1.1 2004/06/03 19:27:06 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#ifndef _COMMON_H
+#define _COMMON_H 1
+
+#include <stdint.h>
+
+#include "../x264.h"
+#include "bs.h"
+#include "set.h"
+#include "predict.h"
+#include "pixel.h"
+#include "mc.h"
+#include "frame.h"
+#include "dct.h"
+#include "cabac.h"
+#include "csp.h"
+
+#define X264_MIN(a,b) ( (a)<(b) ? (a) : (b) )
+#define X264_MAX(a,b) ( (a)>(b) ? (a) : (b) )
+#define X264_ABS(a)   ( (a)< 0 ? -(a) : (a) )
+/* NB: the three macros above evaluate an argument twice; avoid side effects */
+/* x264_malloc: performs or emulates a memalign.
+ * XXX buffers allocated with x264_malloc MUST be released
+ * with x264_free.
+ */
+void *x264_malloc( int );
+void *x264_realloc( void *p, int i_size );
+void  x264_free( void * );
+
+/* mdate: return the current date in microseconds */
+int64_t x264_mdate( void );
+
+static inline int x264_clip3( int v, int i_min, int i_max )
+{
+    /* Clamp v to the closed range [i_min, i_max].
+     *
+     * v      value to clamp
+     * i_min  lowest value that may be returned
+     * i_max  highest value that may be returned
+     *
+     * The lower bound is checked first, so i_min is returned whenever
+     * v < i_min, even if the caller passed i_min > i_max.
+     */
+    const int i_capped = ( v > i_max ) ? i_max : v;
+
+    return ( v < i_min ) ? i_min : i_capped;
+}
+
+enum slice_type_e
+{   /* NOTE(review): the values look like the H.264 slice_type codes 0..4 - confirm */
+    SLICE_TYPE_P  = 0,
+    SLICE_TYPE_B  = 1,
+    SLICE_TYPE_I  = 2,
+    SLICE_TYPE_SP = 3,
+    SLICE_TYPE_SI = 4
+};
+
+typedef struct
+{   /* slice header fields; NOTE(review): names presumably follow the H.264 slice_header() syntax - confirm */
+    x264_sps_t *sps;
+    x264_pps_t *pps;
+
+    int i_type;         /* presumably an enum slice_type_e value - confirm */
+    int i_first_mb;
+
+    int i_pps_id;
+
+    int i_frame_num;
+
+    int b_field_pic;
+    int b_bottom_field;
+
+    int i_idr_pic_id;   /* -1 if nal_type != 5 */
+
+    int i_poc_lsb;
+    int i_delta_poc_bottom;
+
+    int i_delta_poc[2];
+    int i_redundant_pic_cnt;
+
+    int b_direct_spatial_mv_pred;
+
+    int b_num_ref_idx_override;
+    int i_num_ref_idx_l0_active;
+    int i_num_ref_idx_l1_active;
+
+    int i_cabac_init_idc;
+
+    int i_qp_delta;
+    int b_sp_for_swidth;
+    int i_qs_delta;
+
+    /* deblocking filter */
+    int i_disable_deblocking_filter_idc;
+    int i_alpha_c0_offset;
+    int i_beta_offset;
+
+} x264_slice_header_t;
+
+/* From ffmpeg: x264_scan8[] maps a block index to its column + 8*row slot
+ * in an 8 columns x 6 rows grid (drawn below the table) */
+#define X264_SCAN8_SIZE (6*8)   /* number of slots in the grid */
+#define X264_SCAN8_0 (4+1*8)    /* slot of the first luma block, x264_scan8[0] */
+
+static const int x264_scan8[16+2*4] =
+{
+    /* Luma */
+    4+1*8, 5+1*8, 4+2*8, 5+2*8,
+    6+1*8, 7+1*8, 6+2*8, 7+2*8,
+    4+3*8, 5+3*8, 4+4*8, 5+4*8,
+    6+3*8, 7+3*8, 6+4*8, 7+4*8,
+
+    /* Cb */
+    1+1*8, 2+1*8,
+    1+2*8, 2+2*8,
+
+    /* Cr */
+    1+4*8, 2+4*8,
+    1+5*8, 2+5*8,
+};
+/* Grid used by x264_scan8 (L: luma, B: Cb, R: Cr), columns across, rows down:
+   0 1 2 3 4 5 6 7
+ 0
+ 1   B B   L L L L
+ 2   B B   L L L L
+ 3         L L L L
+ 4   R R   L L L L
+ 5   R R
+*/
+
+#define X264_BFRAME_MAX 16  /* the frames.current/next/unused lists hold X264_BFRAME_MAX+1 pointers */
+
+typedef struct x264_ratecontrol_t   x264_ratecontrol_t;
+typedef struct x264_vlc_table_t     x264_vlc_table_t;
+
+struct x264_t
+{   /* whole codec state: parameters, output, frame lists, per-MB data, function tables */
+    /* encoder parameters */
+    x264_param_t    param;
+
+    /* bitstream output */
+    struct
+    {
+        int         i_nal;
+        x264_nal_t  nal[3];         /* for now 3 is enough */
+        int         i_bitstream;    /* size of p_bitstream */
+        uint8_t     *p_bitstream;   /* will hold data for all nal */
+        bs_t        bs;
+    } out;
+
+    /* frame number/poc */
+    int             i_frame;
+    int             i_poc;
+
+    int             i_frame_offset; /* decoding only */
+    int             i_frame_num;    /* decoding only */
+    int             i_poc_msb;      /* decoding only */
+    int             i_poc_lsb;      /* decoding only */
+
+    /* We use only one SPS and one PPS */
+    x264_sps_t      sps_array[32];
+    x264_sps_t      *sps;
+    x264_pps_t      pps_array[256];
+    x264_pps_t      *pps;
+    int             i_idr_pic_id;
+
+    /* Slice header */
+    x264_slice_header_t sh;
+
+    /* cabac context */
+    x264_cabac_t    cabac;
+
+    struct
+    {
+        /* Frames to be encoded */
+        x264_frame_t *current[X264_BFRAME_MAX+1];
+        /* Temporary buffer (eg B frames pending until we reach the I/P) */
+        x264_frame_t *next[X264_BFRAME_MAX+1];
+        /* Unused frames */
+        x264_frame_t *unused[X264_BFRAME_MAX+1];
+
+        /* frames used for reference +1 for decoding */
+        x264_frame_t *reference[16+1];
+
+        int i_last_idr; /* How many I non IDR frames from last IDR */
+        int i_last_i;   /* How many P/B frames from last I */
+    } frames;
+
+    /* current frame being encoded */
+    x264_frame_t    *fenc;
+
+    /* frame being reconstructed */
+    x264_frame_t    *fdec;
+
+    /* references lists */
+    int             i_ref0;
+    x264_frame_t    *fref0[16];       /* ref list 0 */
+    int             i_ref1;
+    x264_frame_t    *fref1[16];       /* ref list 1 */
+
+
+
+    /* Current MB DCT coeffs */
+    struct
+    {
+        DECLARE_ALIGNED( int, luma16x16_dc[16], 16 );
+        DECLARE_ALIGNED( int, chroma_dc[2][4], 16 );
+        struct
+        {
+            DECLARE_ALIGNED( int, residual_ac[15], 16 );
+            DECLARE_ALIGNED( int, luma4x4[16], 16 );
+        } block[16+8];
+    } dct;
+
+    /* MB table and cache for current frame/mb */
+    struct
+    {
+        /* Strides */
+        int     i_mb_stride;
+
+        /* Current index */
+        int     i_mb_x;
+        int     i_mb_y;
+        int     i_mb_xy;
+
+        unsigned int i_neighbour;
+
+        /* mb table */
+        int8_t  *type;                      /* mb type */
+        int8_t  *qp;                        /* mb qp */
+        int16_t *cbp;                       /* mb cbp: 0x0?: luma, 0x?0: chroma, 0x100: luma dc, 0x0200 and 0x0400: chroma dc  (all set for PCM)*/
+        int8_t  (*intra4x4_pred_mode)[7];   /* intra4x4 pred mode. for non I4x4 set to I_PRED_4x4_DC(2) */
+        uint8_t (*non_zero_count)[16+4+4];  /* nzc. for I_PCM set to 16 */
+        int8_t  *chroma_pred_mode;          /* chroma_pred_mode. cabac only. for non intra I_PRED_CHROMA_DC(0) */
+        int16_t (*mv[2])[2];                /* mb mv. set to 0 for intra mb */
+        int16_t (*mvd[2])[2];               /* mb mv difference with predict. set to 0 if intra. cabac only */
+        int8_t   *ref[2];                   /* mb ref. set to -1 if not used (intra or Lx only) */
+
+        /* current value */
+        int     i_type;
+        int     i_partition;
+        int     i_sub_partition[4];
+
+        int     i_cbp_luma;
+        int     i_cbp_chroma;
+
+        int     i_intra16x16_pred_mode;
+        int     i_chroma_pred_mode;
+
+        struct
+        {
+            /* pointer over mb of the frame to be compressed */
+            uint8_t *p_fenc[3];
+
+            /* pointer over mb of the frame to be reconstructed */
+            uint8_t *p_fdec[3];
+
+            /* pointer over mb of the references */
+            uint8_t *p_fref[2][16][3];
+
+            /* common stride */
+            int     i_stride[3];
+        } pic;
+
+        /* cache */
+        struct
+        {
+            /* real intra4x4_pred_mode if I_4X4, I_PRED_4x4_DC if mb available, -1 if not */
+            int     intra4x4_pred_mode[X264_SCAN8_SIZE];
+
+            /* i_non_zero_count if available else 0x80 */
+            int     non_zero_count[X264_SCAN8_SIZE];
+
+            /* -1 if unused, -2 if unavailable */
+            int8_t  ref[2][X264_SCAN8_SIZE];
+
+            /* 0 if not available */
+            int16_t mv[2][X264_SCAN8_SIZE][2];
+            int16_t mvd[2][X264_SCAN8_SIZE][2];
+        } cache;
+
+        /* */
+        int     i_last_qp;  /* last qp */
+        int     i_last_dqp; /* last delta qp */
+
+    } mb;
+
+    /* rate control encoding only */
+    x264_ratecontrol_t *rc;
+
+    /* stats */
+    struct
+    {
+        /* per slice info */
+        int   i_slice_count[5];
+        int   i_slice_size[5];
+        float f_psnr_y[5];
+        float f_psnr_u[5];
+        float f_psnr_v[5];
+        int   i_mb_count[5][18];
+    } stat;
+
+    /* CPU dependent functions */
+    x264_predict_t      predict_16x16[4+3];
+    x264_predict_t      predict_8x8[4+3];
+    x264_predict_t      predict_4x4[9+3];
+
+    x264_pixel_function_t pixf;
+    x264_mc_function_t    mc[2];
+    x264_dct_function_t   dctf;
+    x264_csp_function_t   csp;
+
+    /* vlc table for decoding purpose only */
+    x264_vlc_table_t *x264_coeff_token_lookup[5];
+    x264_vlc_table_t *x264_level_prefix_lookup;
+    x264_vlc_table_t *x264_total_zeros_lookup[15];
+    x264_vlc_table_t *x264_total_zeros_dc_lookup[3];
+    x264_vlc_table_t *x264_run_before_lookup[7];
+};
+
+#endif
+
diff --git a/core/cpu.c b/core/cpu.c
new file mode 100644
index 00000000..8e6d1e3e
--- /dev/null
+++ b/core/cpu.c
@@ -0,0 +1,233 @@
+/*****************************************************************************
+ * cpu.c: h264 encoder library
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: cpu.c,v 1.1 2004/06/03 19:27:06 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+
+#include "../x264.h"
+#include "cpu.h"
+
+#ifdef ARCH_X86
+extern int  x264_cpu_cpuid_test( void ); /* 0 means that cpuid is not available */
+extern uint32_t  x264_cpu_cpuid( uint32_t op, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx );
+extern void x264_emms( void ); /* defined outside this file; presumably in core/i386/cpu.asm - confirm */
+
+uint32_t x264_cpu_detect( void )
+{
+    /* Build the X264_CPU_* capability mask of the running x86 CPU with cpuid.
+     * Returns 0 when cpuid or MMX is missing: the other extensions are only
+     * looked at once MMX has been found. */
+    uint32_t eax, ebx, ecx, edx;
+    uint32_t cpu;
+    int      b_amd;
+
+    if( !x264_cpu_cpuid_test() )
+    {
+        return 0;   /* No cpuid */
+    }
+
+    /* leaf 0: give up when eax is 0; ebx/edx/ecx spell the vendor name */
+    x264_cpu_cpuid( 0, &eax, &ebx, &ecx, &edx );
+    if( eax == 0 )
+    {
+        return 0;
+    }
+    b_amd = ebx == 0x68747541 &&    /* "Auth" */
+            edx == 0x69746e65 &&    /* "enti" */
+            ecx == 0x444d4163;      /* "cAMD" */
+
+    /* leaf 1: standard feature bits in edx */
+    x264_cpu_cpuid( 1, &eax, &ebx, &ecx, &edx );
+    if( !( edx & 0x00800000 ) )
+        return 0;   /* No MMX */
+
+    cpu = X264_CPU_MMX;
+    if( edx & 0x02000000 )
+    {
+        /* SSE - identical to AMD MMX extensions */
+        cpu |= X264_CPU_MMXEXT|X264_CPU_SSE;
+    }
+    if( edx & 0x04000000 )
+    {
+        /* Is it OK ? */
+        cpu |= X264_CPU_SSE2;
+    }
+
+    /* leaf 0x80000000: eax tells whether leaf 0x80000001 can be queried */
+    x264_cpu_cpuid( 0x80000000, &eax, &ebx, &ecx, &edx );
+    if( eax < 0x80000001 )
+        return cpu; /* no extended capabilities */
+
+    /* leaf 0x80000001: extended feature bits in edx */
+    x264_cpu_cpuid( 0x80000001, &eax, &ebx, &ecx, &edx );
+    if( edx & 0x80000000 )
+    {
+        cpu |= X264_CPU_3DNOW;
+    }
+    if( b_amd && ( edx & 0x00400000 ) )
+    {
+        cpu |= X264_CPU_MMXEXT;     /* AMD MMX extensions */
+    }
+
+    return cpu;
+}
+
+void     x264_cpu_restore( uint32_t cpu )
+{
+    /* x264_emms() is called when any MMX-family flag is set in cpu
+     * (presumably it executes EMMS - confirm in the assembly source) */
+    const uint32_t i_mmx_family = X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_3DNOW|X264_CPU_3DNOWEXT;
+    if( ( cpu & i_mmx_family ) != 0 ) x264_emms();
+}
+
+
+#if 0
+/* Disabled (#if 0) inline-assembly version of the cpuid detection.
+ * XXX: adapted from libmpeg2 */
+#if 0
+#define cpuid(op,eax,ebx,ecx,edx)   \
+    __asm__ ("push %%ebx\n\t"       \
+             "cpuid\n\t"            \
+             "movl %%ebx,%1\n\t"    \
+             "pop %%ebx"        \
+             : "=a" (eax),      \
+               "=r" (ebx),      \
+               "=c" (ecx),      \
+               "=d" (edx)       \
+             : "a" (op)         \
+             : "cc")
+#endif
+
+uint32_t x264_cpu_detect( void )
+{
+    uint32_t cpu = 0;
+
+    uint32_t eax, ebx, ecx, edx;
+    int      b_amd;
+
+
+    /* Test if cpuid is supported */
+    asm volatile(
+        "pushf\n"
+        "pushf\n"
+        "pop %0\n"
+        "movl %0,%1\n"
+        "xorl $0x200000,%0\n"
+        "push %0\n"
+        "popf\n"
+        "pushf\n"
+        "pop %0\n"
+        "popf\n"
+         : "=r" (eax), "=r" (ebx) : : "cc");
+
+    if( eax == ebx )
+    {
+        /* No cpuid */
+        return 0;
+    }
+
+    cpuid( 0, eax, ebx, ecx, edx);
+    if( eax == 0 )
+    {
+        return 0;
+    }
+    b_amd   = (ebx == 0x68747541) && (ecx == 0x444d4163) && (edx == 0x69746e65);
+
+    cpuid( 1, eax, ebx, ecx, edx );
+    if( (edx&0x00800000) == 0 )
+    {
+        /* No MMX */
+        return 0;
+    }
+    cpu = X264_CPU_MMX;
+    if( (edx&0x02000000) )
+    {
+        /* SSE - identical to AMD MMX extensions */
+        cpu |= X264_CPU_MMXEXT|X264_CPU_SSE;
+    }
+    if( (edx&0x04000000) )
+    {
+        /* Is it OK ? */
+        cpu |= X264_CPU_SSE2;
+    }
+
+    cpuid( 0x80000000, eax, ebx, ecx, edx );
+    if( eax < 0x80000001 )
+    {
+        /* no extended capabilities */
+        return cpu;
+    }
+
+    cpuid( 0x80000001, eax, ebx, ecx, edx );
+    if( edx&0x80000000 )
+    {
+        cpu |= X264_CPU_3DNOW;
+    }
+    if( b_amd && (edx&0x00400000) )
+    {
+        /* AMD MMX extensions */
+        cpu |= X264_CPU_MMXEXT;
+    }
+
+    return cpu;
+}
+#endif
+
+#elif defined( HAVE_ALTIVEC )
+#include <sys/sysctl.h>
+
+uint32_t x264_cpu_detect( void )
+{
+    /* Thx VLC: query sysctl( CTL_HW, HW_VECTORUNIT ) for a vector unit */
+    int    mib[2];
+    int    b_vector = 0;
+    size_t i_len = sizeof( b_vector );
+
+    mib[0] = CTL_HW;
+    mib[1] = HW_VECTORUNIT;
+    if( sysctl( mib, 2, &b_vector, &i_len, NULL, 0 ) != 0 )
+    {
+        return 0;   /* the query failed: report no capability */
+    }
+
+    return b_vector ? X264_CPU_ALTIVEC : 0;
+}
+
+void     x264_cpu_restore( uint32_t cpu )
+{   /* nothing is done in the HAVE_ALTIVEC build: cpu is ignored */
+}
+
+#else
+
+uint32_t x264_cpu_detect( void )
+{   /* fallback when neither ARCH_X86 nor HAVE_ALTIVEC is defined: no capability flag */
+    return 0;
+}
+
+void     x264_cpu_restore( uint32_t cpu )
+{   /* fallback build: nothing is done, cpu is ignored */
+}
+
+#endif
+
diff --git a/core/cpu.h b/core/cpu.h
new file mode 100644
index 00000000..a9df3f83
--- /dev/null
+++ b/core/cpu.h
@@ -0,0 +1,32 @@
+/*****************************************************************************
+ * cpu.h: h264 encoder library
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: cpu.h,v 1.1 2004/06/03 19:27:06 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#ifndef _CPU_H
+#define _CPU_H 1
+
+uint32_t x264_cpu_detect( void ); /* returns a mask of X264_CPU_* flags, 0 if none */
+
+/* probably MMX(EXT) centric but .... (takes the mask from x264_cpu_detect) */
+void     x264_cpu_restore( uint32_t cpu );
+
+#endif
diff --git a/core/csp.c b/core/csp.c
new file mode 100644
index 00000000..1dda6b60
--- /dev/null
+++ b/core/csp.c
@@ -0,0 +1,379 @@
+/*****************************************************************************
+ * csp.c: h264 encoder library
+ *****************************************************************************
+ * Copyright (C) 2004 Laurent Aimar
+ * $Id: csp.c,v 1.1 2004/06/03 19:27:06 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <string.h>
+
+#include "common.h"
+
+/* Copy h rows of w bytes each from src to dst; i_src and i_dst are the
+ * per-row pointer steps (i_src may be negative to walk the source upward). */
+static inline void plane_copy( uint8_t *dst, int i_dst,
+                               uint8_t *src, int i_src, int w, int h)
+{
+    int row;
+
+    for( row = 0; row < h; row++, dst += i_dst, src += i_src )
+        memcpy( dst, src, w );
+}
+/* Like plane_copy, but reads the source rows from the last one upward (vertical flip). */
+static inline void plane_copy_vflip( uint8_t *dst, int i_dst, uint8_t *src, int i_src, int w, int h)
+{
+    plane_copy( dst, i_dst, &src[(h - 1) * i_src], -i_src, w, h );
+}
+
+/* Vertical 2:1 downsample.  Writes h rows of w bytes; each output byte is
+ * ( a + b + 1 ) >> 1 of a source byte and the byte i_src after it. */
+static inline void plane_subsamplev2( uint8_t *dst, int i_dst,
+                                      uint8_t *src, int i_src, int w, int h)
+{
+    int row, col;
+
+    for( row = 0; row < h; row++ )
+    {
+        for( col = 0; col < w; col++ )
+        {
+            dst[col] = ( src[col] + src[col + i_src] + 1 ) >> 1;
+        }
+        dst += i_dst;
+        src += 2 * i_src;    /* two source rows per output row */
+    }
+}
+
+/* plane_subsamplev2 on a bottom-up source: starts at source row 2*h-1 and steps by -i_src. */
+static inline void plane_subsamplev2_vlip( uint8_t *dst, int i_dst, uint8_t *src, int i_src, int w, int h)
+{
+    plane_subsamplev2( dst, i_dst, &src[(2*h - 1) * i_src], -i_src, w, h );
+}
+
+/* 2:1 downsample in both directions.  Writes h rows of w bytes to dst; each
+ * output byte is the mean of a 2x2 source block rounded to nearest, hence the
+ * + 2 before >> 2 (a bias of + 1 would round x.5 down, unlike the v2 filter). */
+static inline void plane_subsamplehv2( uint8_t *dst, int i_dst,
+                                       uint8_t *src, int i_src, int w, int h)
+{
+    int row, col;
+
+    for( row = 0; row < h; row++ )
+    {
+        const uint8_t *s = src;
+        for( col = 0; col < w; col++, s += 2 )
+            dst[col] = ( s[0] + s[1] + s[i_src] + s[i_src+1] + 2 ) >> 2;
+        dst += i_dst;
+        src += 2 * i_src;    /* two source rows per output row */
+    }
+}
+
+/* plane_subsamplehv2 on a bottom-up source: starts at source row 2*h-1 and steps by -i_src. */
+static inline void plane_subsamplehv2_vlip( uint8_t *dst, int i_dst, uint8_t *src, int i_src, int w, int h)
+{
+    plane_subsamplehv2( dst, i_dst, &src[(2*h - 1) * i_src], -i_src, w, h );
+}
+
+/* Copy a planar I420 picture into the encoder frame.
+ * Plane 0 is copied at i_width x i_height, planes 1 and 2 at half that size
+ * in both directions.  When X264_CSP_VFLIP is set in img->i_csp the source
+ * rows are read bottom-up, so the stored picture is vertically flipped. */
+static void i420_to_i420( x264_frame_t *frm, x264_image_t *img,
+                          int i_width, int i_height )
+{
+    const int b_vflip = ( img->i_csp & X264_CSP_VFLIP ) != 0;
+    int i;
+
+    for( i = 0; i < 3; i++ )
+    {
+        /* plane 0 is full size, planes 1 and 2 are subsampled */
+        const int w = ( i == 0 ) ? i_width  : i_width  / 2;
+        const int h = ( i == 0 ) ? i_height : i_height / 2;
+
+        if( b_vflip )
+        {
+            plane_copy_vflip( frm->plane[i], frm->i_stride[i],
+                              img->plane[i], img->i_stride[i], w, h );
+        }
+        else
+        {
+            plane_copy( frm->plane[i], frm->i_stride[i],
+                        img->plane[i], img->i_stride[i], w, h );
+        }
+    }
+}
+
+/* Copy a planar YV12 picture into the I420 frame.  The two chroma planes are
+ * swapped on the way: input plane 1 goes to frame plane 2 and input plane 2
+ * to frame plane 1.  X264_CSP_VFLIP selects bottom-up reads of the source. */
+static void yv12_to_i420( x264_frame_t *frm, x264_image_t *img,
+                          int i_width, int i_height )
+{
+    static const int i_dst_plane[3] = { 0, 2, 1 };
+    int i;
+
+    for( i = 0; i < 3; i++ )
+    {
+        /* input plane i is written to frame plane p */
+        const int p = i_dst_plane[i];
+        const int w = ( i == 0 ) ? i_width  : i_width  / 2;
+        const int h = ( i == 0 ) ? i_height : i_height / 2;
+
+        if( img->i_csp & X264_CSP_VFLIP )
+        {
+            plane_copy_vflip( frm->plane[p], frm->i_stride[p],
+                              img->plane[i], img->i_stride[i], w, h );
+        }
+        else
+        {
+            plane_copy( frm->plane[p], frm->i_stride[p],
+                        img->plane[i], img->i_stride[i], w, h );
+        }
+    }
+}
+
+/* Convert planar I422 to I420: plane 0 is copied as is, and planes 1 and 2
+ * (i_width / 2 samples per row) are halved vertically by averaging row pairs.
+ * X264_CSP_VFLIP selects the bottom-up variant of both operations. */
+static void i422_to_i420( x264_frame_t *frm, x264_image_t *img,
+                          int i_width, int i_height )
+{
+    const int b_vflip = ( img->i_csp & X264_CSP_VFLIP ) != 0;
+    int i;
+
+    /* luma */
+    if( b_vflip )
+        plane_copy_vflip( frm->plane[0], frm->i_stride[0],
+                          img->plane[0], img->i_stride[0], i_width, i_height );
+    else
+        plane_copy( frm->plane[0], frm->i_stride[0],
+                    img->plane[0], img->i_stride[0], i_width, i_height );
+
+    /* chroma: i_width / 2 x i_height / 2 output samples per plane */
+    for( i = 1; i < 3; i++ )
+    {
+        if( b_vflip )
+            plane_subsamplev2_vlip( frm->plane[i], frm->i_stride[i],
+                                    img->plane[i], img->i_stride[i],
+                                    i_width / 2, i_height / 2 );
+        else
+            plane_subsamplev2( frm->plane[i], frm->i_stride[i],
+                               img->plane[i], img->i_stride[i],
+                               i_width / 2, i_height / 2 );
+    }
+}
+
+/* Convert planar I444 to I420: plane 0 is copied as is, and planes 1 and 2
+ * are halved in both directions by averaging 2x2 blocks of source samples.
+ * X264_CSP_VFLIP selects the bottom-up variant of both operations. */
+static void i444_to_i420( x264_frame_t *frm, x264_image_t *img,
+                          int i_width, int i_height )
+{
+    const int b_vflip = ( img->i_csp & X264_CSP_VFLIP ) != 0;
+    int i;
+
+    /* luma */
+    if( b_vflip )
+        plane_copy_vflip( frm->plane[0], frm->i_stride[0],
+                          img->plane[0], img->i_stride[0], i_width, i_height );
+    else
+        plane_copy( frm->plane[0], frm->i_stride[0],
+                    img->plane[0], img->i_stride[0], i_width, i_height );
+
+    /* chroma: i_width / 2 x i_height / 2 output samples per plane */
+    for( i = 1; i < 3; i++ )
+    {
+        if( b_vflip )
+            plane_subsamplehv2_vlip( frm->plane[i], frm->i_stride[i],
+                                     img->plane[i], img->i_stride[i],
+                                     i_width / 2, i_height / 2 );
+        else
+            plane_subsamplehv2( frm->plane[i], frm->i_stride[i],
+                                img->plane[i], img->i_stride[i],
+                                i_width / 2, i_height / 2 );
+    }
+}
+/* Convert packed YUYV (4 bytes Y0 U Y1 V per pair of pixels) to planar I420.
+ * Source rows are consumed in pairs: luma is copied from both rows, while
+ * each U and V sample is ( a + b + 1 ) >> 1 of the chroma bytes of the two
+ * rows.  With X264_CSP_VFLIP the source is walked from its last row upward. */
+static void yuyv_to_i420( x264_frame_t *frm, x264_image_t *img,
+                          int i_width, int i_height )
+{
+    uint8_t *src   = img->plane[0];
+    int      i_src = img->i_stride[0];
+    uint8_t *y_row = frm->plane[0];
+    uint8_t *u_row = frm->plane[1];
+    uint8_t *v_row = frm->plane[2];
+    int      i_rows;
+
+    /* vertical flip: begin on the last source row and step backwards */
+    if( img->i_csp & X264_CSP_VFLIP )
+    {
+        src  += ( i_height - 1 ) * i_src;
+        i_src = -i_src;
+    }
+
+    for( i_rows = i_height; i_rows > 0; i_rows -= 2 )
+    {
+        const uint8_t *top = src;           /* first source row of the pair  */
+        const uint8_t *bot = src + i_src;   /* second source row of the pair */
+        uint8_t *y_top = y_row;
+        uint8_t *y_bot = y_row + frm->i_stride[0];
+        int x, c;
+
+        /* x counts luma samples, c counts chroma samples (one per pixel pair) */
+        for( x = 0, c = 0; x < i_width; x += 2, c++ )
+        {
+            const uint8_t *t = &top[2 * x];
+            const uint8_t *b = &bot[2 * x];
+
+            y_top[x]     = t[0];
+            y_top[x + 1] = t[2];
+            y_bot[x]     = b[0];
+            y_bot[x + 1] = b[2];
+
+            u_row[c] = ( t[1] + b[1] + 1 ) >> 1;
+            v_row[c] = ( t[3] + b[3] + 1 ) >> 1;
+        }
+
+        /* advance two source / luma rows and one chroma row */
+        src   += 2 * i_src;
+        y_row += 2 * frm->i_stride[0];
+        u_row += frm->i_stride[1];
+        v_row += frm->i_stride[2];
+    }
+}
+
+/* Fixed-point RGB -> YUV coefficients (same values as in XviD) */
+#define BITS 8  /* number of fractional bits */
+#define FIX(f) ((int)((f) * (1 << BITS) + 0.5))  /* real constant -> rounded fixed point */
+
+#define Y_R   FIX(0.257)  /* Y = Y_ADD + ( Y_R*R + Y_G*G + Y_B*B ) >> BITS */
+#define Y_G   FIX(0.504)
+#define Y_B   FIX(0.098)
+#define Y_ADD 16
+
+#define U_R   FIX(0.148)  /* U = U_ADD + ( -U_R*R - U_G*G + U_B*B ) >> BITS */
+#define U_G   FIX(0.291)
+#define U_B   FIX(0.439)
+#define U_ADD 128
+
+#define V_R   FIX(0.439)  /* V = V_ADD + (  V_R*R - V_G*G - V_B*B ) >> BITS */
+#define V_G   FIX(0.368)
+#define V_B   FIX(0.071)
+#define V_ADD 128
+#define RGB_TO_I420( name, POS_R, POS_G, POS_B, S_RGB ) \
+static void name( x264_frame_t *frm, x264_image_t *img, \
+                  int i_width, int i_height )           \
+{   /* packed RGB -> I420; POS_x: byte offset of a channel in a pixel, S_RGB: bytes per pixel */ \
+    uint8_t *src = img->plane[0];                       \
+    int     i_src= img->i_stride[0];                    \
+    int     i_y  = frm->i_stride[0];                    \
+    uint8_t *y   = frm->plane[0];                       \
+    uint8_t *u   = frm->plane[1];                       \
+    uint8_t *v   = frm->plane[2];                       \
+    /* with VFLIP, start on the last source row and step upward */ \
+    if( img->i_csp & X264_CSP_VFLIP )                   \
+    {                                                   \
+        src += ( i_height - 1 ) * i_src;                \
+        i_src = -i_src;                                 \
+    }                                                   \
+    /* each pass converts two source rows: two luma rows, one chroma row */ \
+    for(  ; i_height > 0; i_height -= 2 )               \
+    {                                                   \
+        uint8_t *ss = src;                              \
+        uint8_t *yy = y;                                \
+        uint8_t *uu = u;                                \
+        uint8_t *vv = v;                                \
+        int w;                                          \
+        /* each step converts one 2x2 pixel block */    \
+        for( w = i_width; w > 0; w -= 2 )               \
+        {                                               \
+            int cr = 0,cg = 0,cb = 0;                   \
+            int r, g, b;                                \
+                                                        \
+            /* Luma; cr/cg/cb accumulate R/G/B over the 2x2 block */ \
+            cr = r = ss[POS_R];                         \
+            cg = g = ss[POS_G];                         \
+            cb = b = ss[POS_B];                         \
+                                                        \
+            yy[0] = Y_ADD + ((Y_R * r + Y_G * g + Y_B * b) >> BITS);    \
+            /* same column, next source row */          \
+            cr+= r = ss[POS_R+i_src];                   \
+            cg+= g = ss[POS_G+i_src];                   \
+            cb+= b = ss[POS_B+i_src];                   \
+            yy[i_y] = Y_ADD + ((Y_R * r + Y_G * g + Y_B * b) >> BITS);  \
+            yy++;                                       \
+            ss += S_RGB;                                \
+            /* second column of the block */            \
+            cr+= r = ss[POS_R];                         \
+            cg+= g = ss[POS_G];                         \
+            cb+= b = ss[POS_B];                         \
+                                                        \
+            yy[0] = Y_ADD + ((Y_R * r + Y_G * g + Y_B * b) >> BITS);    \
+                                                        \
+            cr+= r = ss[POS_R+i_src];                   \
+            cg+= g = ss[POS_G+i_src];                   \
+            cb+= b = ss[POS_B+i_src];                   \
+            yy[i_y] = Y_ADD + ((Y_R * r + Y_G * g + Y_B * b) >> BITS);  \
+            yy++;                                       \
+            ss += S_RGB;                                \
+                                                        \
+            /* Chroma from the 2x2 sums; the extra >> 2 divides by 4 */ \
+            *uu++ = (uint8_t)(U_ADD + ((-U_R * cr - U_G * cg + U_B * cb) >> (BITS+2)) ); \
+            *vv++ = (uint8_t)(V_ADD + (( V_R * cr - V_G * cg - V_B * cb) >> (BITS+2)) ); \
+        }                                               \
+                                                        \
+        src += 2*i_src;                                   \
+        y += 2*frm->i_stride[0];                        \
+        u += frm->i_stride[1];                          \
+        v += frm->i_stride[2];                          \
+    }                                                   \
+}
+
+RGB_TO_I420( rgb_to_i420,  0, 1, 2, 3 )  /* bytes R,G,B;   3 per pixel (no ';': the macro is a full definition) */
+RGB_TO_I420( bgr_to_i420,  2, 1, 0, 3 )  /* bytes B,G,R;   3 per pixel */
+RGB_TO_I420( bgra_to_i420, 2, 1, 0, 4 )  /* bytes B,G,R,x; 4 per pixel, 4th byte unused */
+
+/* Fill *pf with the converters from each input colorspace to i_csp.  Only
+ * X264_CSP_I420 is supported: any other value prints to stderr and exits. */
+void x264_csp_init( int cpu, int i_csp, x264_csp_function_t *pf )
+{
+    if( i_csp != X264_CSP_I420 )
+    {
+        /* For now, can't happen */
+        fprintf( stderr, "arg in x264_csp_init\n" );
+        exit( -1 );
+    }
+
+    /* planar and packed YUV inputs */
+    pf->i420 = i420_to_i420;
+    pf->yv12 = yv12_to_i420;
+    pf->i422 = i422_to_i420;
+    pf->i444 = i444_to_i420;
+    pf->yuyv = yuyv_to_i420;
+    /* packed RGB inputs */
+    pf->rgb  = rgb_to_i420;
+    pf->bgr  = bgr_to_i420;
+    pf->bgra = bgra_to_i420;
+}
+
diff --git a/core/csp.h b/core/csp.h
new file mode 100644
index 00000000..1b02795a
--- /dev/null
+++ b/core/csp.h
@@ -0,0 +1,43 @@
+/*****************************************************************************
+ * csp.h: h264 encoder library
+ *****************************************************************************
+ * Copyright (C) 2004 Laurent Aimar
+ * $Id: csp.h,v 1.1 2004/06/03 19:27:06 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#ifndef _CSP_H
+#define _CSP_H 1
+
+typedef struct
+{   /* One converter per input colorspace; the member name is the input format. */
+    void (*i420)( x264_frame_t *, x264_image_t *, int i_width, int i_height );
+    void (*i422)( x264_frame_t *, x264_image_t *, int i_width, int i_height );
+    void (*i444)( x264_frame_t *, x264_image_t *, int i_width, int i_height );
+    void (*yv12)( x264_frame_t *, x264_image_t *, int i_width, int i_height );
+    void (*yuyv)( x264_frame_t *, x264_image_t *, int i_width, int i_height );
+    void (*rgb )( x264_frame_t *, x264_image_t *, int i_width, int i_height );
+    void (*bgr )( x264_frame_t *, x264_image_t *, int i_width, int i_height );
+    void (*bgra)( x264_frame_t *, x264_image_t *, int i_width, int i_height );
+} x264_csp_function_t;
+
+/* Fills *pf with the converters whose output colorspace is i_csp. */
+void x264_csp_init( int cpu, int i_csp, x264_csp_function_t *pf );
+
+#endif
+
diff --git a/core/dct.c b/core/dct.c
new file mode 100644
index 00000000..65aab0cf
--- /dev/null
+++ b/core/dct.c
@@ -0,0 +1,288 @@
+/*****************************************************************************
+ * dct.c: h264 encoder library
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: dct.c,v 1.1 2004/06/03 19:27:06 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#include <stdlib.h>
+#include <stdint.h>
+
+#include "x264.h"
+
+#include "dct.h"
+#ifdef HAVE_MMXEXT
+#   include "i386/dct.h"
+#endif
+
+
+/* Clamp a to the 8-bit range [0, 255].  Plain comparisons: the result is 255
+ * (not -1) on overflow, with no shift or negation of a negative value. */
+static inline int clip_uint8( int a )
+{
+    if( a < 0 ) return 0;
+    return a > 255 ? 255 : a;
+}
+
+/*
+ * XXX All DC transforms below work in place (input == output), so they go through temporaries.
+ */
+
+/* In-place 2x2 transform: sums and differences along each row of d, then
+ * sums and differences of those across the two rows.  No scaling is applied. */
+static void dct2x2dc( int16_t d[2][2] )
+{
+    const int sum0 = d[0][0] + d[0][1];
+    const int dif0 = d[0][0] - d[0][1];
+    const int sum1 = d[1][0] + d[1][1];
+    const int dif1 = d[1][0] - d[1][1];
+
+    d[0][0] = sum0 + sum1;
+    d[0][1] = dif0 + dif1;
+    d[1][0] = sum0 - sum1;
+    d[1][1] = dif0 - dif1;
+}
+
+/* In-place 4x4 DC transform: a sum/difference butterfly over each row of d
+ * (stored transposed in tmp), then over each row of tmp, with the final
+ * values halved as ( x + 1 ) >> 1 and written down the columns of d. */
+static void dct4x4dc( int16_t d[4][4] )
+{
+    int16_t tmp[4][4];
+    int i;
+
+    for( i = 0; i < 4; i++ )
+    {
+        const int16_t *in = d[i];
+        const int sum_lo = in[0] + in[1], dif_lo = in[0] - in[1];
+        const int sum_hi = in[2] + in[3], dif_hi = in[2] - in[3];
+
+        tmp[0][i] = sum_lo + sum_hi;
+        tmp[1][i] = sum_lo - sum_hi;
+        tmp[2][i] = dif_lo - dif_hi;
+        tmp[3][i] = dif_lo + dif_hi;
+    }
+
+    /* second pass: rows of tmp -> columns of d, halved */
+    for( i = 0; i < 4; i++ )
+    {
+        const int16_t *in = tmp[i];
+        const int sum_lo = in[0] + in[1], dif_lo = in[0] - in[1];
+        const int sum_hi = in[2] + in[3], dif_hi = in[2] - in[3];
+
+        d[0][i] = ( sum_lo + sum_hi + 1 ) >> 1;
+        d[1][i] = ( sum_lo - sum_hi + 1 ) >> 1;
+        d[2][i] = ( dif_lo - dif_hi + 1 ) >> 1;
+        d[3][i] = ( dif_lo + dif_hi + 1 ) >> 1;
+    }
+}
+
+/* In-place inverse 4x4 DC transform: the sum/difference butterfly is run
+ * down each column of d into tmp, then along each row of tmp back into the
+ * same row of d.  No scaling or rounding is applied. */
+static void idct4x4dc( int16_t d[4][4] )
+{
+    int16_t tmp[4][4];
+    int i;
+
+    /* first pass: columns of d -> columns of tmp */
+    for( i = 0; i < 4; i++ )
+    {
+        const int sum_lo = d[0][i] + d[1][i], dif_lo = d[0][i] - d[1][i];
+        const int sum_hi = d[2][i] + d[3][i], dif_hi = d[2][i] - d[3][i];
+
+        tmp[0][i] = sum_lo + sum_hi;
+        tmp[1][i] = sum_lo - sum_hi;
+        tmp[2][i] = dif_lo - dif_hi;
+        tmp[3][i] = dif_lo + dif_hi;
+    }
+
+    /* second pass: rows of tmp -> rows of d */
+    for( i = 0; i < 4; i++ )
+    {
+        const int16_t *in = tmp[i];
+        const int sum_lo = in[0] + in[1], dif_lo = in[0] - in[1];
+        const int sum_hi = in[2] + in[3], dif_hi = in[2] - in[3];
+
+        d[i][0] = sum_lo + sum_hi;
+        d[i][1] = sum_lo - sum_hi;
+        d[i][2] = dif_lo - dif_hi;
+        d[i][3] = dif_lo + dif_hi;
+    }
+}
+
+/* Forward 4x4 transform of the residual pix1 - pix2 (row strides i_pix1 and
+ * i_pix2).  A butterfly with weights 1 and 2 is applied to every row of the
+ * residual (stored transposed in tmp) and then to every row of tmp, writing
+ * the coefficients down the columns of dct. */
+static void sub4x4_dct( int16_t dct[4][4], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
+{
+    int16_t res[4][4];
+    int16_t tmp[4][4];
+    int i, x, y;
+
+    /* residual block */
+    for( y = 0; y < 4; y++, pix1 += i_pix1, pix2 += i_pix2 )
+    {
+        for( x = 0; x < 4; x++ )
+            res[y][x] = pix1[x] - pix2[x];
+    }
+
+    /* first pass: rows of res -> columns of tmp */
+    for( i = 0; i < 4; i++ )
+    {
+        const int16_t *r = res[i];
+        const int outer_sum = r[0] + r[3], inner_sum = r[1] + r[2];
+        const int outer_dif = r[0] - r[3], inner_dif = r[1] - r[2];
+
+        tmp[0][i] =     outer_sum +     inner_sum;
+        tmp[1][i] = 2 * outer_dif +     inner_dif;
+        tmp[2][i] =     outer_sum -     inner_sum;
+        tmp[3][i] =     outer_dif - 2 * inner_dif;
+    }
+
+    /* second pass: rows of tmp -> columns of dct */
+    for( i = 0; i < 4; i++ )
+    {
+        const int16_t *r = tmp[i];
+        const int outer_sum = r[0] + r[3], inner_sum = r[1] + r[2];
+        const int outer_dif = r[0] - r[3], inner_dif = r[1] - r[2];
+
+        dct[0][i] =     outer_sum +     inner_sum;
+        dct[1][i] = 2 * outer_dif +     inner_dif;
+        dct[2][i] =     outer_sum -     inner_sum;
+        dct[3][i] =     outer_dif - 2 * inner_dif;
+    }
+}
+
+/* 8x8 residual DCT as four 4x4 blocks: dct[0..3] = top-left, top-right, bottom-left, bottom-right. */
+static void sub8x8_dct( int16_t dct[4][4][4], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
+{
+    int i;
+    for( i = 0; i < 4; i++ )
+        sub4x4_dct( dct[i], &pix1[4*(i>>1)*i_pix1 + 4*(i&1)], i_pix1, &pix2[4*(i>>1)*i_pix2 + 4*(i&1)], i_pix2 );
+}
+
+/* 16x16 residual DCT as four 8x8 quadrants: dct[0..3], [4..7], [8..11], [12..15] = TL, TR, BL, BR. */
+static void sub16x16_dct( int16_t dct[16][4][4], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
+{
+    int i;
+    for( i = 0; i < 4; i++ )
+        sub8x8_dct( &dct[4*i], &pix1[8*(i>>1)*i_pix1 + 8*(i&1)], i_pix1, &pix2[8*(i>>1)*i_pix2 + 8*(i&1)], i_pix2 );
+}
+
+
+/* Inverse 4x4 transform of dct, added to the 4x4 pixel block at p_dst (row
+ * stride i_dst).  Pass one works down the columns of dct, pass two along the
+ * rows of the intermediate; the result is rounded with ( x + 32 ) >> 6 and
+ * its sum with the existing pixel goes through clip_uint8. */
+static void add4x4_idct( uint8_t *p_dst, int i_dst, int16_t dct[4][4] )
+{
+    int16_t res[4][4];
+    int16_t tmp[4][4];
+    int i, x, y;
+
+    for( i = 0; i < 4; i++ )
+    {
+        const int even_sum = dct[0][i] + dct[2][i];
+        const int even_dif = dct[0][i] - dct[2][i];
+        const int odd_sum  = dct[1][i] + ( dct[3][i] >> 1 );
+        const int odd_dif  = ( dct[1][i] >> 1 ) - dct[3][i];
+
+        tmp[0][i] = even_sum + odd_sum;
+        tmp[1][i] = even_dif + odd_dif;
+        tmp[2][i] = even_dif - odd_dif;
+        tmp[3][i] = even_sum - odd_sum;
+    }
+
+    for( i = 0; i < 4; i++ )
+    {
+        const int16_t *t = tmp[i];
+        const int even_sum = t[0] + t[2];
+        const int even_dif = t[0] - t[2];
+        const int odd_sum  = t[1] + ( t[3] >> 1 );
+        const int odd_dif  = ( t[1] >> 1 ) - t[3];
+
+        res[i][0] = ( even_sum + odd_sum + 32 ) >> 6;
+        res[i][1] = ( even_dif + odd_dif + 32 ) >> 6;
+        res[i][2] = ( even_dif - odd_dif + 32 ) >> 6;
+        res[i][3] = ( even_sum - odd_sum + 32 ) >> 6;
+    }
+
+    /* add the reconstructed residual onto the destination pixels */
+    for( y = 0; y < 4; y++, p_dst += i_dst )
+        for( x = 0; x < 4; x++ )
+            p_dst[x] = clip_uint8( p_dst[x] + res[y][x] );
+}
+
+/* Add four 4x4 inverse transforms to an 8x8 block: dct[0..3] = top-left, top-right, bottom-left, bottom-right. */
+static void add8x8_idct( uint8_t *p_dst, int i_dst, int16_t dct[4][4][4] )
+{
+    int i;
+    for( i = 0; i < 4; i++ )
+        add4x4_idct( &p_dst[4*(i>>1)*i_dst + 4*(i&1)], i_dst, dct[i] );
+}
+
+/* Add four 8x8 inverse transforms to a 16x16 block: dct[0..3], [4..7], [8..11], [12..15] = TL, TR, BL, BR. */
+static void add16x16_idct( uint8_t *p_dst, int i_dst, int16_t dct[16][4][4] )
+{
+    int i;
+    for( i = 0; i < 4; i++ )
+        add8x8_idct( &p_dst[8*(i>>1)*i_dst + 8*(i&1)], i_dst, &dct[4*i] );
+}
+
+
+
+/****************************************************************************
+ * x264_dct_init:
+ ****************************************************************************/
+void x264_dct_init( int cpu, x264_dct_function_t *dctf )
+{
+    /* The C implementations are always installed first. */
+    dctf->sub4x4_dct    = sub4x4_dct;
+    dctf->sub8x8_dct    = sub8x8_dct;
+    dctf->sub16x16_dct  = sub16x16_dct;
+    dctf->add4x4_idct   = add4x4_idct;
+    dctf->add8x8_idct   = add8x8_idct;
+    dctf->add16x16_idct = add16x16_idct;
+
+    dctf->dct4x4dc      = dct4x4dc;
+    dctf->idct4x4dc     = idct4x4dc;
+
+    /* the same 2x2 routine is installed as forward and inverse transform */
+    dctf->dct2x2dc      = dct2x2dc;
+    dctf->idct2x2dc     = dct2x2dc;
+
+#ifdef HAVE_MMXEXT
+    /* Override with the MMXEXT versions when the CPU flag is set
+     * (the 2x2 DC entries keep the C routine). */
+    if( cpu & X264_CPU_MMXEXT )
+    {
+        dctf->sub4x4_dct    = x264_sub4x4_dct_mmxext;
+        dctf->add4x4_idct   = x264_add4x4_idct_mmxext;
+        dctf->sub8x8_dct    = x264_sub8x8_dct_mmxext;
+        dctf->add8x8_idct   = x264_add8x8_idct_mmxext;
+        dctf->sub16x16_dct  = x264_sub16x16_dct_mmxext;
+        dctf->add16x16_idct = x264_add16x16_idct_mmxext;
+        dctf->dct4x4dc      = x264_dct4x4dc_mmxext;
+        dctf->idct4x4dc     = x264_idct4x4dc_mmxext;
+    }
+#endif
+}
+
diff --git a/core/dct.h b/core/dct.h
new file mode 100644
index 00000000..bedbbf43
--- /dev/null
+++ b/core/dct.h
@@ -0,0 +1,49 @@
+/*****************************************************************************
+ * dct.h: h264 encoder library
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: dct.h,v 1.1 2004/06/03 19:27:06 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#ifndef _DCT_H
+#define _DCT_H 1
+
+#include <stdint.h> /* int16_t / uint8_t are used below; keeps this header self-contained */
+
+typedef struct
+{   /* sub*: transform of the residual pix1 - pix2; add*: inverse transform added onto p_dst */
+    void (*sub4x4_dct)   ( int16_t dct[4][4],  uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 );
+    void (*add4x4_idct)  ( uint8_t *p_dst, int i_dst, int16_t dct[4][4] );
+    /* 8x8 block handled as four 4x4 blocks */
+    void (*sub8x8_dct)   ( int16_t dct[4][4][4],  uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 );
+    void (*add8x8_idct)  ( uint8_t *p_dst, int i_dst, int16_t dct[4][4][4] );
+    /* 16x16 block handled as sixteen 4x4 blocks */
+    void (*sub16x16_dct)   ( int16_t dct[16][4][4],  uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 );
+    void (*add16x16_idct)  ( uint8_t *p_dst, int i_dst, int16_t dct[16][4][4] );
+    /* in-place transforms of DC coefficient blocks */
+    void (*dct4x4dc) ( int16_t d[4][4] );
+    void (*idct4x4dc)( int16_t d[4][4] );
+    void (*dct2x2dc) ( int16_t d[2][2] );
+    void (*idct2x2dc)( int16_t d[2][2] );
+} x264_dct_function_t;
+
+/* Fills *dctf with the transform implementations selected for the given cpu flags. */
+void x264_dct_init( int cpu, x264_dct_function_t *dctf );
+
+#endif
diff --git a/core/frame.c b/core/frame.c
new file mode 100644
index 00000000..2e926176
--- /dev/null
+++ b/core/frame.c
@@ -0,0 +1,701 @@
+/*****************************************************************************
+ * frame.c: h264 encoder library
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: frame.c,v 1.1 2004/06/03 19:27:06 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdint.h>
+
+#include "common.h"
+#include "macroblock.h"
+
+/* Allocate a 3-plane frame sized from h->param: dimensions rounded up to a
+ * multiple of 16, plus 64 extra columns/lines (32-pixel border, extra data for
+ * me), chroma scaled per h->param.i_csp.  Returns NULL if an allocation fails. */
+x264_frame_t *x264_frame_new( x264_t *h )
+{
+    x264_frame_t   *frame = x264_malloc( sizeof( x264_frame_t ) );
+    const int i_stride = ( ( h->param.i_width  + 15 )&0xfffff0 )+ 64;
+    const int i_lines  = ( ( h->param.i_height + 15 )&0xfffff0 );
+    int i;
+
+    if( frame == NULL )
+        return NULL;
+    frame->i_plane = 3;
+    for( i = 0; i < 3; i++ )
+    {
+        int i_divh = 1, i_divw = 1;
+        if( i > 0 && h->param.i_csp == X264_CSP_I420 )
+            i_divh = i_divw = 2;
+        else if( i > 0 && h->param.i_csp == X264_CSP_I422 )
+            i_divw = 2;
+        frame->i_stride[i] = i_stride / i_divw;
+        frame->i_lines[i] = i_lines / i_divh;
+        frame->buffer[i] = x264_malloc( frame->i_stride[i] *
+                                        ( frame->i_lines[i] + 64 / i_divh ) );
+        if( frame->buffer[i] == NULL )
+        {   /* release the planes allocated so far, then the frame itself */
+            while( i-- > 0 )
+                x264_free( frame->buffer[i] );
+            x264_free( frame );
+            return NULL;
+        }
+        frame->plane[i] = ((uint8_t*)frame->buffer[i]) +
+                          frame->i_stride[i] * 32 / i_divh + 32 / i_divw;
+    }
+    frame->i_stride[3] = 0;
+    frame->i_lines[3] = 0;
+    frame->buffer[3] = NULL;
+    frame->plane[3] = NULL;
+    frame->i_poc = -1;
+    frame->i_type = X264_TYPE_AUTO;
+    frame->i_qpplus1 = 0;
+    return frame;
+}
+
+/* Release every plane buffer owned by frame, then the frame itself. */
+void x264_frame_delete( x264_frame_t *frame )
+{
+    int i_plane = 0;
+
+    while( i_plane < frame->i_plane )
+        x264_free( frame->buffer[i_plane++] );
+    x264_free( frame );
+}
+
+/* Import a user picture into an encoder frame: the frame type, qp and pts
+ * are copied as they are, then the h->csp routine matching the source
+ * colorspace is called with dst, the source image and the encoder's
+ * picture size (h->param.i_width x h->param.i_height). */
+void x264_frame_copy_picture( x264_t *h, x264_frame_t *dst, x264_picture_t *src )
+{
+    const int i_csp    = src->img.i_csp & X264_CSP_MASK;
+    const int i_width  = h->param.i_width;
+    const int i_height = h->param.i_height;
+
+    /* per-picture metadata */
+    dst->i_type     = src->i_type;
+    dst->i_qpplus1  = src->i_qpplus1;
+    dst->i_pts      = src->i_pts;
+
+    /* pixel data */
+    if( i_csp == X264_CSP_I420 )
+        h->csp.i420( dst, &src->img, i_width, i_height );
+    else if( i_csp == X264_CSP_YV12 )
+        h->csp.yv12( dst, &src->img, i_width, i_height );
+    else if( i_csp == X264_CSP_I422 )
+        h->csp.i422( dst, &src->img, i_width, i_height );
+    else if( i_csp == X264_CSP_I444 )
+        h->csp.i444( dst, &src->img, i_width, i_height );
+    else if( i_csp == X264_CSP_YUYV )
+        h->csp.yuyv( dst, &src->img, i_width, i_height );
+    else if( i_csp == X264_CSP_RGB )
+        h->csp.rgb( dst, &src->img, i_width, i_height );
+    else if( i_csp == X264_CSP_BGR )
+        h->csp.bgr( dst, &src->img, i_width, i_height );
+    else if( i_csp == X264_CSP_BGRA )
+        h->csp.bgra( dst, &src->img, i_width, i_height );
+    else
+    {
+        /* unknown colorspace: only report it, dst planes are left untouched */
+        fprintf( stderr, "Arg invalid CSP\n" );
+    }
+}
+
+
+
+/* Pad every plane by replicating its outermost pixels into the border:
+ * 32 pixels for plane 0, 16 for the others.  The visible width is taken to
+ * be the plane stride minus twice that border. */
+void x264_frame_expand_border( x264_frame_t *frame )
+{
+    int i, y;
+    for( i = 0; i < frame->i_plane; i++ )
+    {
+        const int i_pad    = ( i == 0 ) ? 32 : 16;
+        const int i_stride = frame->i_stride[i];
+        const int i_width  = i_stride - 2 * i_pad;
+        const int i_lines  = frame->i_lines[i];
+        uint8_t  *p_top    = frame->plane[i];
+        uint8_t  *p_bottom = p_top + ( i_lines - 1 ) * i_stride;
+        /* rows above and below the picture, corners included */
+        for( y = 1; y <= i_pad; y++ )
+        {
+            uint8_t *p_up   = p_top    - y * i_stride;
+            uint8_t *p_down = p_bottom + y * i_stride;
+            memcpy( p_up, p_top, i_width );
+            memset( p_up - i_pad, p_top[0], i_pad );
+            memset( p_up + i_width, p_top[i_width - 1], i_pad );
+
+            memcpy( p_down, p_bottom, i_width );
+            memset( p_down - i_pad, p_bottom[0], i_pad );
+            memset( p_down + i_width, p_bottom[i_width - 1], i_pad );
+        }
+        /* columns left and right of each picture row */
+        for( y = 0; y < i_lines; y++ )
+        {
+            uint8_t *p_row = p_top + y * i_stride;
+            memset( p_row - i_pad, p_row[0], i_pad );
+            memset( p_row + i_width, p_row[i_width - 1], i_pad );
+        }
+    }
+}
+
+/* FIXME these tables are duplicated with the ones in macroblock.c */
+static const uint8_t block_idx_xy[4][4] = /* [x][y] position of a 4x4 block inside the MB -> index used with non_zero_count[mb][] */
+{
+    { 0, 2, 8,  10},
+    { 1, 3, 9,  11},
+    { 4, 6, 12, 14},
+    { 5, 7, 13, 15}
+};
+static const int i_chroma_qp_table[52] = /* looked up with clip3( luma qp + pps chroma_qp_index_offset, 0, 51 ) to get the QP used for the chroma edges */
+{
+     0,  1,  2,  3,  4,  5,  6,  7,  8,  9,
+    10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+    20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+    29, 30, 31, 32, 32, 33, 34, 34, 35, 35,
+    36, 36, 37, 37, 37, 38, 38, 38, 39, 39,
+    39, 39
+};
+
+/* Deblocking filter (p153): alpha bounds |p0 - q0| across the edge; indexed by clip3( QP + slice alpha_c0_offset, 0, 51 ) */
+static const int i_alpha_table[52] =
+{
+     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+     0,  0,  0,  0,  0,  0,  4,  4,  5,  6,
+     7,  8,  9, 10, 12, 13, 15, 17, 20, 22,
+    25, 28, 32, 36, 40, 45, 50, 56, 63, 71,
+    80, 90,101,113,127,144,162,182,203,226,
+    255, 255
+};
+static const int i_beta_table[52] = /* beta bounds the pixel differences on one side of the edge; indexed by clip3( QP + slice beta_offset, 0, 51 ) */
+{
+     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+     0,  0,  0,  0,  0,  0,  2,  2,  2,  3,
+     3,  3,  3,  4,  4,  4,  6,  6,  7,  7,
+     8,  8,  9,  9, 10, 10, 11, 11, 12, 12,
+    13, 13, 14, 14, 15, 15, 16, 16, 17, 17,
+    18, 18
+};
+static const int i_tc0_table[52][3] = /* [same index as i_alpha_table][bS - 1]: base clipping bound of the bS < 4 filters */
+{
+    { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 },
+    { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 },
+    { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 1 },
+    { 0, 0, 1 }, { 0, 0, 1 }, { 0, 0, 1 }, { 0, 1, 1 }, { 0, 1, 1 }, { 1, 1, 1 },
+    { 1, 1, 1 }, { 1, 1, 1 }, { 1, 1, 1 }, { 1, 1, 2 }, { 1, 1, 2 }, { 1, 1, 2 },
+    { 1, 1, 2 }, { 1, 2, 3 }, { 1, 2, 3 }, { 2, 2, 3 }, { 2, 2, 4 }, { 2, 3, 4 },
+    { 2, 3, 4 }, { 3, 3, 5 }, { 3, 4, 6 }, { 3, 4, 6 }, { 4, 5, 7 }, { 4, 5, 8 },
+    { 4, 6, 9 }, { 5, 7,10 }, { 6, 8,11 }, { 6, 8,13 }, { 7,10,14 }, { 8,11,16 },
+    { 9,12,18 }, {10,13,20 }, {11,15,23 }, {13,17,25 }
+};
+
+/* Clamp a to the pixel range 0..255 (after ffmpeg).  Returns a real 255 for a > 255
+ * instead of the -1 produced by the former (-a)>>31 trick (callers stored it in a uint8_t). */
+static inline int clip_uint8( int a )
+{
+    if( a & ~255 )      /* some bit outside the low 8 is set: a < 0 or a > 255 */
+        return a < 0 ? 0 : 255;
+    return a;
+}
+
+/* Filter one vertical luma edge over 16 rows.  pix points at the pixel just
+ * right of the edge in the first row (q0); p0..p3 are at pix[-1]..pix[-4].
+ * bS[i] is the filter strength for rows 4*i..4*i+3 (0 = leave untouched);
+ * i_QP plus the slice offsets selects the alpha / beta / tc0 thresholds. */
+static inline void deblocking_filter_edgev( x264_t *h, uint8_t *pix, int i_pix_stride, int bS[4], int i_QP )
+{
+    int i, d;
+    const int i_index_a = x264_clip3( i_QP + h->sh.i_alpha_c0_offset, 0, 51 );
+    const int alpha = i_alpha_table[i_index_a];
+    const int beta  = i_beta_table[x264_clip3( i_QP + h->sh.i_beta_offset, 0, 51 )];
+
+    for( i = 0; i < 4; i++ )
+    {
+        if( bS[i] == 0 )
+        {
+            pix += 4 * i_pix_stride;
+            continue;
+        }
+        if( bS[i] < 4 )
+        {
+            const int tc0 = i_tc0_table[i_index_a][bS[i] - 1];
+            /* 4px edge length */
+            for( d = 0; d < 4; d++ )
+            {
+                const int p0 = pix[-1];
+                const int p1 = pix[-2];
+                const int p2 = pix[-3];
+                const int q0 = pix[0];
+                const int q1 = pix[1];
+                const int q2 = pix[2];
+
+                if( abs( p0 - q0 ) < alpha &&
+                    abs( p1 - p0 ) < beta &&
+                    abs( q1 - q0 ) < beta )
+                {
+                    int tc = tc0;
+                    int i_delta;
+
+                    if( abs( p2 - p0 ) < beta )
+                    {
+                        pix[-2] = p1 + x264_clip3( ( p2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( p1 << 1 ) ) >> 1, -tc0, tc0 );
+                        tc++;
+                    }
+                    if( abs( q2 - q0 ) < beta )
+                    {
+                        pix[1] = q1 + x264_clip3( ( q2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( q1 << 1 ) ) >> 1, -tc0, tc0 );
+                        tc++;
+                    }
+                    /* "* 4" and not "<< 2": q0 - p0 may be negative and shifting it left is undefined */
+                    i_delta = x264_clip3( (( q0 - p0 ) * 4 + (p1 - q1) + 4) >> 3, -tc, tc );
+                    pix[-1] = clip_uint8( p0 + i_delta );    /* p0' */
+                    pix[0]  = clip_uint8( q0 - i_delta );    /* q0' */
+                }
+                pix += i_pix_stride;
+            }
+        }
+        else
+        {
+            /* 4px edge length */
+            for( d = 0; d < 4; d++ )
+            {
+                const int p0 = pix[-1];
+                const int p1 = pix[-2];
+                const int p2 = pix[-3];
+                const int q0 = pix[0];
+                const int q1 = pix[1];
+                const int q2 = pix[2];
+                if( abs( p0 - q0 ) < alpha &&
+                    abs( p1 - p0 ) < beta &&
+                    abs( q1 - q0 ) < beta )
+                {
+                    if( abs( p0 - q0 ) < (( alpha >> 2 ) + 2 ) )
+                    {
+                        if( abs( p2 - p0 ) < beta )
+                        {
+                            const int p3 = pix[-4];
+                            /* p0', p1', p2' */
+                            pix[-1] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
+                            pix[-2] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
+                            pix[-3] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
+                        }
+                        else
+                        {
+                            /* p0' */
+                            pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
+                        }
+                        if( abs( q2 - q0 ) < beta )
+                        {
+                            const int q3 = pix[3];
+                            /* q0', q1', q2' */
+                            pix[0] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
+                            pix[1] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
+                            pix[2] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
+                        }
+                        else
+                        {
+                            /* q0' */
+                            pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
+                        }
+                    }
+                    else
+                    {
+                        /* p0', q0' */
+                        pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
+                        pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
+                    }
+                }
+                pix += i_pix_stride;
+            }
+        }
+    }
+}
+
+/* Filter one vertical chroma edge over 8 rows.  pix points at q0 of the first
+ * row, p0/p1 are at pix[-1]/pix[-2].  bS[] is the luma strength array, so each
+ * entry covers 2 chroma rows.  Only p0 and q0 are modified.  The delta uses
+ * "* 4" rather than "<< 2" because left-shifting a negative q0 - p0 is undefined. */
+static inline void deblocking_filter_edgecv( x264_t *h, uint8_t *pix, int i_pix_stride, int bS[4], int i_QP )
+{
+    int i, d;
+    const int i_index_a = x264_clip3( i_QP + h->sh.i_alpha_c0_offset, 0, 51 );
+    const int alpha = i_alpha_table[i_index_a];
+    const int beta  = i_beta_table[x264_clip3( i_QP + h->sh.i_beta_offset, 0, 51 )];
+
+    for( i = 0; i < 4; i++ )
+    {
+        if( bS[i] == 0 )
+        {
+            pix += 2 * i_pix_stride;
+            continue;
+        }
+        if( bS[i] < 4 )
+        {
+            const int tc = i_tc0_table[i_index_a][bS[i] - 1] + 1;
+            /* 2px edge length (because we use same bS than the one for luma) */
+            for( d = 0; d < 2; d++ )
+            {
+                const int p0 = pix[-1];
+                const int p1 = pix[-2];
+                const int q0 = pix[0];
+                const int q1 = pix[1];
+                if( abs( p0 - q0 ) < alpha &&
+                    abs( p1 - p0 ) < beta &&
+                    abs( q1 - q0 ) < beta )
+                {
+                    const int i_delta = x264_clip3( (( q0 - p0 ) * 4 + (p1 - q1) + 4) >> 3, -tc, tc );
+                    pix[-1] = clip_uint8( p0 + i_delta );    /* p0' */
+                    pix[0]  = clip_uint8( q0 - i_delta );    /* q0' */
+                }
+                pix += i_pix_stride;
+            }
+        }
+        else
+        {
+            /* 2px edge length (because we use same bS than the one for luma) */
+            for( d = 0; d < 2; d++ )
+            {
+                const int p0 = pix[-1];
+                const int p1 = pix[-2];
+                const int q0 = pix[0];
+                const int q1 = pix[1];
+                if( abs( p0 - q0 ) < alpha &&
+                    abs( p1 - p0 ) < beta &&
+                    abs( q1 - q0 ) < beta )
+                {
+                    pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
+                    pix[0]  = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
+                }
+                pix += i_pix_stride;
+            }
+        }
+    }
+}
+
+/* Filter one horizontal luma edge over 16 columns.  pix points at the pixel
+ * just below the edge in the first column (q0); p0..p3 are 1..4 rows above it
+ * (i_pix_stride apart).  bS[i] is the strength for columns 4*i..4*i+3 (0 = leave
+ * untouched); i_QP plus the slice offsets selects alpha / beta / tc0. */
+static inline void deblocking_filter_edgeh( x264_t *h, uint8_t *pix, int i_pix_stride, int bS[4], int i_QP )
+{
+    int i, d;
+    const int i_index_a = x264_clip3( i_QP + h->sh.i_alpha_c0_offset, 0, 51 );
+    const int alpha = i_alpha_table[i_index_a];
+    const int beta  = i_beta_table[x264_clip3( i_QP + h->sh.i_beta_offset, 0, 51 )];
+    int i_pix_next  = i_pix_stride;
+
+    for( i = 0; i < 4; i++ )
+    {
+        if( bS[i] == 0 )
+        {
+            pix += 4;
+            continue;
+        }
+        if( bS[i] < 4 )
+        {
+            const int tc0 = i_tc0_table[i_index_a][bS[i] - 1];
+            /* 4px edge length */
+            for( d = 0; d < 4; d++ )
+            {
+                const int p0 = pix[-i_pix_next];
+                const int p1 = pix[-2*i_pix_next];
+                const int p2 = pix[-3*i_pix_next];
+                const int q0 = pix[0];
+                const int q1 = pix[1*i_pix_next];
+                const int q2 = pix[2*i_pix_next];
+
+                if( abs( p0 - q0 ) < alpha &&
+                    abs( p1 - p0 ) < beta &&
+                    abs( q1 - q0 ) < beta )
+                {
+                    int tc = tc0;
+                    int i_delta;
+
+                    if( abs( p2 - p0 ) < beta )
+                    {
+                        pix[-2*i_pix_next] = p1 + x264_clip3( ( p2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( p1 << 1 ) ) >> 1, -tc0, tc0 );
+                        tc++;
+                    }
+                    if( abs( q2 - q0 ) < beta )
+                    {
+                        pix[i_pix_next] = q1 + x264_clip3( ( q2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( q1 << 1 ) ) >> 1, -tc0, tc0 );
+                        tc++;
+                    }
+                    /* "* 4" and not "<< 2": q0 - p0 may be negative and shifting it left is undefined */
+                    i_delta = x264_clip3( (( q0 - p0 ) * 4 + (p1 - q1) + 4) >> 3, -tc, tc );
+                    pix[-i_pix_next] = clip_uint8( p0 + i_delta );    /* p0' */
+                    pix[0]           = clip_uint8( q0 - i_delta );    /* q0' */
+                }
+                pix++;
+            }
+        }
+        else
+        {
+            /* 4px edge length */
+            for( d = 0; d < 4; d++ )
+            {
+                const int p0 = pix[-i_pix_next];
+                const int p1 = pix[-2*i_pix_next];
+                const int p2 = pix[-3*i_pix_next];
+                const int q0 = pix[0];
+                const int q1 = pix[1*i_pix_next];
+                const int q2 = pix[2*i_pix_next];
+                if( abs( p0 - q0 ) < alpha &&
+                    abs( p1 - p0 ) < beta &&
+                    abs( q1 - q0 ) < beta )
+                {
+                    const int p3 = pix[-4*i_pix_next];
+                    const int q3 = pix[ 3*i_pix_next];
+
+                    if( abs( p0 - q0 ) < (( alpha >> 2 ) + 2 ) )
+                    {
+                        if( abs( p2 - p0 ) < beta )
+                        {
+                            /* p0', p1', p2' */
+                            pix[-1*i_pix_next] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
+                            pix[-2*i_pix_next] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
+                            pix[-3*i_pix_next] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
+                        }
+                        else
+                        {
+                            /* p0' */
+                            pix[-1*i_pix_next] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
+                        }
+                        if( abs( q2 - q0 ) < beta )
+                        {
+                            /* q0', q1', q2' */
+                            pix[0*i_pix_next] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
+                            pix[1*i_pix_next] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
+                            pix[2*i_pix_next] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
+                        }
+                        else
+                        {
+                            /* q0' */
+                            pix[0*i_pix_next] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
+                        }
+                    }
+                    else
+                    {
+                        /* p0' */
+                        pix[-1*i_pix_next] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
+                        /* q0' */
+                        pix[0*i_pix_next] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
+                    }
+                }
+                pix++;
+            }
+        }
+    }
+}
+
+/* Filter one horizontal chroma edge over 8 columns.  pix points at q0 of the
+ * first column, p0/p1 are one/two rows above it.  bS[] is the luma strength
+ * array, so each entry covers 2 chroma columns.  Only p0 and q0 are modified.
+ * The delta uses "* 4" rather than "<< 2": left-shifting a negative q0 - p0 is undefined. */
+static inline void deblocking_filter_edgech( x264_t *h, uint8_t *pix, int i_pix_stride, int bS[4], int i_QP )
+{
+    int i, d;
+    const int i_index_a = x264_clip3( i_QP + h->sh.i_alpha_c0_offset, 0, 51 );
+    const int alpha = i_alpha_table[i_index_a];
+    const int beta  = i_beta_table[x264_clip3( i_QP + h->sh.i_beta_offset, 0, 51 )];
+    int i_pix_next  = i_pix_stride;
+
+    for( i = 0; i < 4; i++ )
+    {
+        if( bS[i] == 0 )
+        {
+            pix += 2;
+            continue;
+        }
+        if( bS[i] < 4 )
+        {
+            int tc = i_tc0_table[i_index_a][bS[i] - 1] + 1;
+            /* 2px edge length (see deblocking_filter_edgecv) */
+            for( d = 0; d < 2; d++ )
+            {
+                const int p0 = pix[-1*i_pix_next];
+                const int p1 = pix[-2*i_pix_next];
+                const int q0 = pix[0];
+                const int q1 = pix[1*i_pix_next];
+                if( abs( p0 - q0 ) < alpha &&
+                    abs( p1 - p0 ) < beta &&
+                    abs( q1 - q0 ) < beta )
+                {
+                    int i_delta = x264_clip3( (( q0 - p0 ) * 4 + (p1 - q1) + 4) >> 3, -tc, tc );
+                    pix[-i_pix_next] = clip_uint8( p0 + i_delta );    /* p0' */
+                    pix[0]           = clip_uint8( q0 - i_delta );    /* q0' */
+                }
+                pix++;
+            }
+        }
+        else
+        {
+            /* 2px edge length (see deblocking_filter_edgecv) */
+            for( d = 0; d < 2; d++ )
+            {
+                const int p0 = pix[-1*i_pix_next];
+                const int p1 = pix[-2*i_pix_next];
+                const int q0 = pix[0];
+                const int q1 = pix[1*i_pix_next];
+                if( abs( p0 - q0 ) < alpha &&
+                    abs( p1 - p0 ) < beta &&
+                    abs( q1 - q0 ) < beta )
+                {
+                    pix[-i_pix_next] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
+                    pix[0]           = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
+                }
+                pix++;
+            }
+        }
+    }
+}
+
+/* In-loop deblocking of the whole reconstructed frame h->fdec, in place.
+ * Macroblocks are visited in raster order; for each one the vertical then the
+ * horizontal 4x4 edges get a strength bS per 4 pixels and are filtered (luma on
+ * every edge, chroma on even ones).  B slices are not supported: see FIXME below. */
+void x264_frame_deblocking_filter( x264_t *h, int i_slice_type )
+{
+    const int s8x8 = 2 * h->mb.i_mb_stride;    /* row stride used to index h->mb.ref[0][] (8x8 units) */
+    const int s4x4 = 4 * h->mb.i_mb_stride;    /* row stride used to index h->mb.mv[0][] (4x4 units) */
+    int mb_y, mb_x;
+
+    for( mb_y = 0, mb_x = 0; mb_y < h->sps->i_mb_height; )
+    {
+        const int mb_xy  = mb_y * h->mb.i_mb_stride + mb_x;
+        const int mb_8x8 = 2 * s8x8 * mb_y + 2 * mb_x;
+        const int mb_4x4 = 4 * s4x4 * mb_y + 4 * mb_x;
+        int i_edge;
+        int i_dir;
+
+        /* i_dir == 0 -> vertical edge
+         * i_dir == 1 -> horizontal edge */
+        for( i_dir = 0; i_dir < 2; i_dir++ )
+        {
+            int i_start;
+            int i_qp, i_qpn;
+            /* edge 0 is the MB boundary: skip it when there is no left (i_dir 0) / top (i_dir 1) neighbour */
+            i_start = (( i_dir == 0 && mb_x != 0 ) || ( i_dir == 1 && mb_y != 0 ) ) ? 0 : 1;
+            for( i_edge = i_start; i_edge < 4; i_edge++ )
+            {
+                /* the "n" side of edge 0 is the left / top MB, for inner edges it is the current MB itself */
+                int mbn_xy  = i_edge > 0 ? mb_xy  : ( i_dir == 0 ? mb_xy  - 1 : mb_xy - h->mb.i_mb_stride );
+                int mbn_8x8 = i_edge > 0 ? mb_8x8 : ( i_dir == 0 ? mb_8x8 - 2 : mb_8x8 - 2 * s8x8 );
+                int mbn_4x4 = i_edge > 0 ? mb_4x4 : ( i_dir == 0 ? mb_4x4 - 4 : mb_4x4 - 4 * s4x4 );
+                int bS[4];  /* filtering strength */
+                /* *** Get bS for each 4px for the current edge *** */
+                if( IS_INTRA( h->mb.type[mb_xy] ) || IS_INTRA( h->mb.type[mbn_xy] ) )
+                {
+                    bS[0] = bS[1] = bS[2] = bS[3] = ( i_edge == 0 ? 4 : 3 );
+                }
+                else
+                {
+                    int i;
+                    for( i = 0; i < 4; i++ )
+                    {
+                        int x  = i_dir == 0 ? i_edge : i;
+                        int y  = i_dir == 0 ? i      : i_edge;
+                        int xn = (x - (i_dir == 0 ? 1 : 0 ))&0x03;   /* 4x4 block on the other side, wrapping to 3 */
+                        int yn = (y - (i_dir == 0 ? 0 : 1 ))&0x03;   /* inside the neighbour MB for edge 0 */
+                        /* coded coefficients on either side -> 2; else (P slice) differing ref or mv delta >= 4 -> 1; else 0 */
+                        if( h->mb.non_zero_count[mb_xy][block_idx_xy[x][y]] != 0 ||
+                            h->mb.non_zero_count[mbn_xy][block_idx_xy[xn][yn]] != 0 )
+                        {
+                            bS[i] = 2;
+                        }
+                        else if( i_slice_type == SLICE_TYPE_P )
+                        {
+                            if( h->mb.ref[0][mb_8x8+(x/2)+(y/2)*s8x8] != h->mb.ref[0][mbn_8x8+(xn/2)+(yn/2)*s8x8] ||
+                                abs( h->mb.mv[0][mb_4x4+x+y*s4x4][0] - h->mb.mv[0][mbn_4x4+xn+yn*s4x4][0] ) >= 4 ||
+                                abs( h->mb.mv[0][mb_4x4+x+y*s4x4][1] - h->mb.mv[0][mbn_4x4+xn+yn*s4x4][1] ) >= 4 )
+                            {
+                                bS[i] = 1;
+                            }
+                            else
+                            {
+                                bS[i] = 0;
+                            }
+                        }
+                        else
+                        {
+                            /* FIXME: gives up here, leaving the rest of the frame unfiltered */
+                            fprintf( stderr, "deblocking filter doesn't work yet with B slice\n" );
+                            return;
+                        }
+                    }
+                }
+
+                /* *** filter *** */
+                /* Y plane */
+                i_qp = h->mb.qp[mb_xy];
+                i_qpn= h->mb.qp[mbn_xy];
+                if( i_dir == 0 )
+                {
+                    /* vertical edge */
+                    deblocking_filter_edgev( h, &h->fdec->plane[0][16 * mb_y * h->fdec->i_stride[0]+ 16 * mb_x + 4 * i_edge],
+                                                h->fdec->i_stride[0], bS, (i_qp+i_qpn+1) >> 1);
+                    if( (i_edge % 2) == 0  )
+                    {
+                        /* U/V planes */
+                        int i_qpc = ( i_chroma_qp_table[x264_clip3( i_qp + h->pps->i_chroma_qp_index_offset, 0, 51 )] +
+                                      i_chroma_qp_table[x264_clip3( i_qpn + h->pps->i_chroma_qp_index_offset, 0, 51 )] + 1 ) >> 1;
+                        deblocking_filter_edgecv( h, &h->fdec->plane[1][8*(mb_y*h->fdec->i_stride[1]+mb_x)+i_edge*2],
+                                                      h->fdec->i_stride[1], bS, i_qpc );
+                        deblocking_filter_edgecv( h, &h->fdec->plane[2][8*(mb_y*h->fdec->i_stride[2]+mb_x)+i_edge*2],
+                                                  h->fdec->i_stride[2], bS, i_qpc );
+                    }
+                }
+                else
+                {
+                    /* horizontal edge */
+                    deblocking_filter_edgeh( h, &h->fdec->plane[0][(16*mb_y + 4 * i_edge) * h->fdec->i_stride[0]+ 16 * mb_x],
+                                                h->fdec->i_stride[0], bS, (i_qp+i_qpn+1) >> 1 );
+                    /* U/V planes */
+                    if( ( i_edge % 2  ) == 0 )
+                    {
+                        int i_qpc = ( i_chroma_qp_table[x264_clip3( i_qp + h->pps->i_chroma_qp_index_offset, 0, 51 )] +
+                                      i_chroma_qp_table[x264_clip3( i_qpn + h->pps->i_chroma_qp_index_offset, 0, 51 )] + 1 ) >> 1;
+                        deblocking_filter_edgech( h, &h->fdec->plane[1][8*(mb_y*h->fdec->i_stride[1]+mb_x)+i_edge*2*h->fdec->i_stride[1]],
+                                                 h->fdec->i_stride[1], bS, i_qpc );
+                        deblocking_filter_edgech( h, &h->fdec->plane[2][8*(mb_y*h->fdec->i_stride[2]+mb_x)+i_edge*2*h->fdec->i_stride[2]],
+                                                 h->fdec->i_stride[2], bS, i_qpc );
+                    }
+                }
+            }
+        }
+
+        /* next mb (raster order: left to right, then the next row) */
+        mb_x++;
+        if( mb_x >= h->sps->i_mb_width )
+        {
+            mb_x = 0;
+            mb_y++;
+        }
+    }
+}
+
+
+
+
diff --git a/core/frame.h b/core/frame.h
new file mode 100644
index 00000000..da7c4576
--- /dev/null
+++ b/core/frame.h
@@ -0,0 +1,56 @@
+/*****************************************************************************
+ * frame.h: h264 encoder library
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: frame.h,v 1.1 2004/06/03 19:27:06 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#ifndef _FRAME_H
+#define _FRAME_H 1
+
+typedef struct
+{
+    /* per-picture metadata */
+    int     i_poc;      /* -1 on a freshly allocated frame; presumably the picture order count */
+    int     i_type;     /* X264_TYPE_AUTO by default, otherwise copied from the input picture */
+    int     i_qpplus1;  /* 0 by default, otherwise copied from the input picture */
+    int64_t i_pts;      /* copied from the input picture */
+
+    /* YUV buffer */
+    int     i_plane;        /* number of planes in use (3 after x264_frame_new) */
+    int     i_stride[4];    /* bytes per row of each plane, border included */
+    int     i_lines[4];     /* rows of each plane, border excluded */
+    uint8_t *plane[4];      /* first picture pixel of each plane, inside buffer[] */
+
+    /* for unrestricted mv we allocate more data than needed
+     * allocated data are stored in buffer */
+    void    *buffer[4];
+
+} x264_frame_t;
+
+x264_frame_t *x264_frame_new( x264_t *h );                 /* allocate a frame sized from h->param */
+void          x264_frame_delete( x264_frame_t *frame );    /* free the plane buffers, then the frame */
+/* copy type, qp and pts from src, then hand its image to the matching h->csp routine */
+void          x264_frame_copy_picture( x264_t *h, x264_frame_t *dst, x264_picture_t *src );
+/* replicate the edge pixels of every plane into its border */
+void          x264_frame_expand_border( x264_frame_t *frame );
+/* deblock h->fdec in place; gives up with a message on B slices */
+void          x264_frame_deblocking_filter( x264_t *h, int i_slice_type );
+
+#endif
diff --git a/core/i386/cpu.asm b/core/i386/cpu.asm
new file mode 100644
index 00000000..06ac1e05
--- /dev/null
+++ b/core/i386/cpu.asm
@@ -0,0 +1,111 @@
+;*****************************************************************************
+;* cpu.asm: h264 encoder library
+;*****************************************************************************
+;* Copyright (C) 2003 x264 project
+;* $Id: cpu.asm,v 1.1 2004/06/03 19:27:07 fenrir Exp $
+;*
+;* Authors: Laurent Aimar <fenrir@via.ecp.fr>
+;*
+;* This program is free software; you can redistribute it and/or modify
+;* it under the terms of the GNU General Public License as published by
+;* the Free Software Foundation; either version 2 of the License, or
+;* (at your option) any later version.
+;*
+;* This program is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;* GNU General Public License for more details.
+;*
+;* You should have received a copy of the GNU General Public License
+;* along with this program; if not, write to the Free Software
+;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+;*****************************************************************************
+
+BITS 32
+
+;=============================================================================
+; Macros and other preprocessor constants
+;=============================================================================
+; cglobal NAME: export NAME as a global symbol; when PREFIX is defined the symbol is _NAME and NAME is aliased to it
+%macro cglobal 1
+	%ifdef PREFIX
+		global _%1
+		%define %1 _%1
+	%else
+		global %1
+	%endif
+%endmacro
+
+;=============================================================================
+; Code
+;=============================================================================
+
+SECTION .text
+
+cglobal x264_cpu_cpuid_test
+cglobal x264_cpu_cpuid
+cglobal x264_emms
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+;   int __cdecl x264_cpu_cpuid_test( void ) return 0 if unsupported
+;-----------------------------------------------------------------------------
+x264_cpu_cpuid_test:
+    pushfd                      ; keep the caller's EFLAGS
+    push    ebx
+
+    pushfd
+    pop     eax                 ; eax = current EFLAGS
+    mov     ebx, eax            ; ebx = reference copy
+    xor     eax, 0x200000       ; flip bit 21 (the ID flag)
+    push    eax
+    popfd                       ; try to write the modified flags
+    pushfd
+    pop     eax                 ; read them back
+    xor     eax, ebx            ; eax = bits that actually changed
+    ; eax is non-zero only if the flipped bit stuck, which is the "supported" result
+    pop     ebx
+    popfd                       ; restore the caller's EFLAGS
+    ret
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+;   int __cdecl x264_cpu_cpuid( int op, int *eax, int *ebx, int *ecx, int *edx )
+;-----------------------------------------------------------------------------
+x264_cpu_cpuid:
+    ; run cpuid with eax = op and store the four result registers through the pointer arguments
+    push    ebp
+    mov     ebp,    esp
+    push    ebx                 ; overwritten by cpuid
+    push    esi                 ; used below as the destination pointer
+    push    edi                 ; NOTE(review): saved and restored but not used in this routine
+
+    mov     eax,    [ebp +  8]  ; op
+    cpuid
+
+    mov     esi,    [ebp + 12]  ; int *eax
+    mov     [esi],  eax
+
+    mov     esi,    [ebp + 16]  ; int *ebx
+    mov     [esi],  ebx
+
+    mov     esi,    [ebp + 20]  ; int *ecx
+    mov     [esi],  ecx
+
+    mov     esi,    [ebp + 24]  ; int *edx
+    mov     [esi],  edx
+
+    pop     edi
+    pop     esi
+    pop     ebx
+    pop     ebp
+    ret                         ; NOTE(review): no return value is set up; eax still holds what cpuid left in it
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+;   void __cdecl x264_emms( void )
+;-----------------------------------------------------------------------------
+x264_emms:
+    emms                        ; leave MMX state so that x87 floating point code can be used again
+    ret
+
diff --git a/core/i386/dct-c.c b/core/i386/dct-c.c
new file mode 100644
index 00000000..d824a23f
--- /dev/null
+++ b/core/i386/dct-c.c
@@ -0,0 +1,294 @@
+/*****************************************************************************
+ * dct.c: h264 encoder library
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: dct-c.c,v 1.1 2004/06/03 19:27:07 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#include <stdlib.h>
+#include <stdint.h>
+
+#include "x264.h"
+
+#include "../dct.h"
+#include "dct.h"
+
+
+#if 0  /* disabled inline-asm versions; dct.asm defines the same x264_*_mmxext symbols */
+#define MMX_ZERO( MMZ ) /* MMZ = 0 */ \
+    asm volatile( "pxor " #MMZ ", " #MMZ "\n" :: )
+
+/* MMP = pix1[0..3] - pix2[0..3] as 4 x 16-bit words; MMT: scratch; MMZ: register holding zero */
+#define MMX_LOAD_DIFF_4P( MMP, MMT, MMZ, pix1, pix2 ) \
+    asm volatile( "movd (%0), " #MMP "\n" \
+                  "punpcklbw  " #MMZ ", " #MMP "\n" \
+                  "movd (%1), " #MMT "\n" \
+                  "punpcklbw  " #MMZ ", " #MMT "\n" \
+                  "psubw      " #MMT ", " #MMP "\n" : : "r"(pix1), "r"(pix2) )
+
+/* out: MMA = a + b, MMB = b - a   (a, b: the values of MMA, MMB on entry) */
+#define MMX_SUMSUB_BA( MMA, MMB ) \
+    asm volatile( "paddw " #MMB ", " #MMA "\n"\
+                  "paddw " #MMB ", " #MMB "\n"\
+                  "psubw " #MMA ", " #MMB "\n" :: )
+
+#define MMX_SUMSUB_BADC( MMA, MMB, MMC, MMD ) /* MMX_SUMSUB_BA on (MMA,MMB) and on (MMC,MMD), interleaved */ \
+    asm volatile( "paddw " #MMB ", " #MMA "\n"\
+                  "paddw " #MMD ", " #MMC "\n"\
+                  "paddw " #MMB ", " #MMB "\n"\
+                  "paddw " #MMD ", " #MMD "\n"\
+                  "psubw " #MMA ", " #MMB "\n"\
+                  "psubw " #MMC ", " #MMD "\n" :: )
+
+/* out: MMA = 2a + b, MMT = a - 2b   (a, b: the values of MMA, MMB on entry; MMB is only read) */
+#define MMX_SUMSUB2_AB( MMA, MMB, MMT ) \
+    asm volatile( "movq  " #MMA ", " #MMT "\n" \
+                  "paddw " #MMA ", " #MMA "\n" \
+                  "paddw " #MMB ", " #MMA "\n" \
+                  "psubw " #MMB ", " #MMT "\n" \
+                  "psubw " #MMB ", " #MMT "\n" :: )
+
+/* out: MMA = a + (b>>1), MMS = (a>>1) - b   (MMB is left holding b>>1, MMT holding b) */
+#define MMX_SUMSUBD2_AB( MMA, MMB, MMT, MMS ) \
+    asm volatile( "movq  " #MMA ", " #MMS "\n" \
+                  "movq  " #MMB ", " #MMT "\n" \
+                  "psraw   $1    , " #MMB "\n"       \
+                  "psraw   $1    , " #MMS "\n"       \
+                  "paddw " #MMB ", " #MMA "\n" \
+                  "psubw " #MMT ", " #MMS "\n" :: )
+
+#define SBUTTERFLYwd(a,b,t ) /* a = low words of a and b interleaved; t = high words of a and b interleaved */\
+    asm volatile( "movq " #a ", " #t "        \n\t" \
+                  "punpcklwd " #b ", " #a "   \n\t" \
+                  "punpckhwd " #b ", " #t "   \n\t" :: )
+
+#define SBUTTERFLYdq(a,b,t ) /* a = low dwords of a and b; t = high dwords of a and b */\
+    asm volatile( "movq " #a ", " #t "        \n\t" \
+                  "punpckldq " #b ", " #a "   \n\t" \
+                  "punpckhdq " #b ", " #t "   \n\t" :: )
+
+/* 4x4 transpose of 16-bit words: rows in MMA,MMB,MMC,MMD; result rows in MMA,MMD,MMT,MMC (MMB is clobbered) */
+#define MMX_TRANSPOSE( MMA, MMB, MMC, MMD, MMT ) \
+        SBUTTERFLYwd( MMA, MMB, MMT ); \
+        SBUTTERFLYwd( MMC, MMD, MMB ); \
+        SBUTTERFLYdq( MMA, MMC, MMD ); \
+        SBUTTERFLYdq( MMT, MMB, MMC )
+
+#define MMX_STORE_DIFF_4P( MMP, MMT, MM32, MMZ, dst ) /* dst[0..3] = sat_u8( dst[0..3] + ((MMP + MM32) >> 6) ) */ \
+    asm volatile( "paddw     " #MM32 "," #MMP "\n" \
+                  "psraw       $6,     " #MMP "\n" \
+                  "movd        (%0),   " #MMT "\n" \
+                  "punpcklbw " #MMZ ", " #MMT "\n" \
+                  "paddsw    " #MMT ", " #MMP "\n" \
+                  "packuswb  " #MMZ ", " #MMP "\n" \
+                  "movd      " #MMP ",   (%0)\n" :: "r"(dst) )
+
+#define UNUSED_LONGLONG( foo ) /* 16-byte aligned 64-bit constant whose assembler-level name is exactly foo */ \
+    static const unsigned long long foo __asm__ (#foo)  __attribute__((unused)) __attribute__((aligned(16)))
+
+UNUSED_LONGLONG( x264_mmx_32 ) = 0x0020002000200020ULL;  /* 4 x 16-bit 32: added before the >>6 in x264_add4x4_idct_mmxext */
+UNUSED_LONGLONG( x264_mmx_1 ) = 0x0001000100010001ULL;   /* 4 x 16-bit 1: added before the >>1 in x264_dct4x4dc_mmxext */
+
+
+/*
+ * XXX The dc transforms below run in place: d[] is both the input and the output.
+ */
+void x264_dct4x4dc_mmxext( int16_t d[4][4] )
+{
+    /* load DCT: mm0..mm3 = d[0]..d[3], four 16-bit coefficients per register */
+    asm volatile(
+        "movq   (%0), %%mm0\n"
+        "movq  8(%0), %%mm1\n"
+        "movq 16(%0), %%mm2\n"
+        "movq 24(%0), %%mm3\n" :: "r"(d) );
+
+    MMX_SUMSUB_BADC( %%mm1, %%mm0, %%mm3, %%mm2 );  /* mm1=s01  mm0=d01  mm3=s23  mm2=d23 */
+    MMX_SUMSUB_BADC( %%mm3, %%mm1, %%mm2, %%mm0 );  /* mm3=s01+s23  mm1=s01-s23  mm2=d01+d23  mm0=d01-d23 */
+
+    /* in: mm3, mm1, mm0, mm2  out: mm3, mm2, mm4, mm0 */
+    MMX_TRANSPOSE  ( %%mm3, %%mm1, %%mm0, %%mm2, %%mm4 );
+
+    MMX_SUMSUB_BADC( %%mm2, %%mm3, %%mm0, %%mm4 );  /* mm2=s01  mm3=d01  mm0=s23  mm4=d23 */
+    MMX_SUMSUB_BADC( %%mm0, %%mm2, %%mm4, %%mm3 );  /* mm0=s01+s23  mm2=s01-s23  mm4=d01+d23  mm3=d01-d23 */
+
+    /* in: mm0, mm2, mm3, mm4  out: mm0, mm4, mm1, mm3 */
+    MMX_TRANSPOSE  ( %%mm0, %%mm2, %%mm3, %%mm4, %%mm1 );
+
+
+    asm volatile( "movq x264_mmx_1, %%mm6" :: );    /* mm6 = four words of 1: rounding term for the >>1 below */
+
+    /* Store back: every coefficient x is written as (x + 1) >> 1 */
+    asm volatile(
+        "paddw %%mm6, %%mm0\n"
+        "paddw %%mm6, %%mm4\n"
+
+        "psraw $1,    %%mm0\n"
+        "movq  %%mm0,   (%0)\n"
+        "psraw $1,    %%mm4\n"
+        "movq  %%mm4,  8(%0)\n"
+
+        "paddw %%mm6, %%mm1\n"
+        "paddw %%mm6, %%mm3\n"
+
+        "psraw $1,    %%mm1\n"
+        "movq  %%mm1, 16(%0)\n"
+        "psraw $1,    %%mm3\n"
+        "movq  %%mm3, 24(%0)\n" :: "r"(d) );
+}
+
+void x264_idct4x4dc_mmxext( int16_t d[4][4] )
+{
+    /* load DCT: mm0..mm3 = d[0]..d[3], four 16-bit coefficients per register */
+    asm volatile(
+        "movq   (%0), %%mm0\n"
+        "movq  8(%0), %%mm1\n"
+        "movq 16(%0), %%mm2\n" 
+        "movq 24(%0), %%mm3\n" :: "r"(d) );
+
+    MMX_SUMSUB_BADC( %%mm1, %%mm0, %%mm3, %%mm2 );  /* mm1=s01  mm0=d01  mm3=s23  mm2=d23 */
+    MMX_SUMSUB_BADC( %%mm3, %%mm1, %%mm2, %%mm0 );  /* mm3=s01+s23 mm1=s01-s23 mm2=d01+d23 mm0=d01-d23 */
+
+    /* in: mm3, mm1, mm0, mm2  out: mm3, mm2, mm4, mm0 */
+    MMX_TRANSPOSE( %%mm3, %%mm1, %%mm0, %%mm2, %%mm4 );
+
+    MMX_SUMSUB_BADC( %%mm2, %%mm3, %%mm0, %%mm4 );  /* mm2=s01  mm3=d01  mm0=s23  mm4=d23 */
+    MMX_SUMSUB_BADC( %%mm0, %%mm2, %%mm4, %%mm3 );  /* mm0=s01+s23 mm2=s01-s23 mm4=d01+d23 mm3=d01-d23 */
+
+    /* in: mm0, mm2, mm3, mm4  out: mm0, mm4, mm1, mm3 */
+    MMX_TRANSPOSE( %%mm0, %%mm2, %%mm3, %%mm4, %%mm1 );
+
+    /* Store back in place; unlike x264_dct4x4dc_mmxext there is no (x + 1) >> 1 step here */
+    asm volatile(
+        "movq %%mm0,   (%0)\n"
+        "movq %%mm4,  8(%0)\n"
+        "movq %%mm1, 16(%0)\n" 
+        "movq %%mm3, 24(%0)\n" :: "r"(d) );
+}
+
+/****************************************************************************
+ * subXxX_dct:
+ ****************************************************************************/
+inline void x264_sub4x4_dct_mmxext( int16_t dct[4][4], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
+{
+    /* Reset mm7: it is the zero register that MMX_LOAD_DIFF_4P unpacks against */
+    MMX_ZERO( %%mm7 );
+
+    /* Load 4 lines: mmN = row N of pix1 minus row N of pix2, as 16-bit words (mm6 is scratch) */
+    MMX_LOAD_DIFF_4P( %%mm0, %%mm6, %%mm7, &pix1[0*i_pix1], &pix2[0*i_pix2] );
+    MMX_LOAD_DIFF_4P( %%mm1, %%mm6, %%mm7, &pix1[1*i_pix1], &pix2[1*i_pix2] );
+    MMX_LOAD_DIFF_4P( %%mm2, %%mm6, %%mm7, &pix1[2*i_pix1], &pix2[2*i_pix2] );
+    MMX_LOAD_DIFF_4P( %%mm3, %%mm6, %%mm7, &pix1[3*i_pix1], &pix2[3*i_pix2] );
+
+    MMX_SUMSUB_BADC( %%mm3, %%mm0, %%mm2, %%mm1 );  /* mm3=s03  mm0=d03  mm2=s12  mm1=d12 */
+
+    MMX_SUMSUB_BA(  %%mm2, %%mm3 );                 /* mm2=s03+s12      mm3=s03-s12 */
+    MMX_SUMSUB2_AB( %%mm0, %%mm1, %%mm4 );          /* mm0=2.d03+d12    mm4=d03-2.d12 */
+
+    /* transpose in: mm2, mm0, mm3, mm4, out: mm2, mm4, mm1, mm3 */
+    MMX_TRANSPOSE( %%mm2, %%mm0, %%mm3, %%mm4, %%mm1 );
+
+    MMX_SUMSUB_BADC( %%mm3, %%mm2, %%mm1, %%mm4 );  /* mm3=s03  mm2=d03  mm1=s12  mm4=d12 */
+
+    MMX_SUMSUB_BA(  %%mm1, %%mm3 );                 /* mm1=s03+s12      mm3=s03-s12 */
+    MMX_SUMSUB2_AB( %%mm2, %%mm4, %%mm0 );          /* mm2=2.d03+d12    mm0=d03-2.d12 */
+
+    /* transpose in: mm1, mm2, mm3, mm0, out: mm1, mm0, mm4, mm3 */
+    MMX_TRANSPOSE( %%mm1, %%mm2, %%mm3, %%mm0, %%mm4 );
+
+    /* Store back: the four result rows go to dct[0]..dct[3] */
+    asm volatile(
+        "movq %%mm1, (%0)\n"
+        "movq %%mm0, 8(%0)\n"
+        "movq %%mm4, 16(%0)\n"
+        "movq %%mm3, 24(%0)\n" :: "r"(dct) );
+}
+#endif
+
+/* Run x264_sub4x4_dct_mmxext on the four 4x4 quadrants of an 8x8 block, in raster order (bit 0 of i: column, bit 1: row). */
+void x264_sub8x8_dct_mmxext( int16_t dct[4][4][4], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
+{
+    int i;
+    for( i = 0; i < 4; i++ )
+        x264_sub4x4_dct_mmxext( dct[i], &pix1[4*(i>>1)*i_pix1 + 4*(i&1)], i_pix1, &pix2[4*(i>>1)*i_pix2 + 4*(i&1)], i_pix2 );
+}
+
+/* Run x264_sub8x8_dct_mmxext on the four 8x8 quadrants of a 16x16 block, in raster order; quadrant i fills dct[4*i .. 4*i+3]. */
+void x264_sub16x16_dct_mmxext( int16_t dct[16][4][4], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
+{
+    int i;
+    for( i = 0; i < 4; i++ )
+        x264_sub8x8_dct_mmxext( &dct[4*i], &pix1[8*(i>>1)*i_pix1 + 8*(i&1)], i_pix1, &pix2[8*(i>>1)*i_pix2 + 8*(i&1)], i_pix2 );
+}
+
+
+
+/****************************************************************************
+ * addXxX_idct:
+ ****************************************************************************/
+#if 0  /* disabled inline-asm version; dct.asm defines x264_add4x4_idct_mmxext */
+inline void x264_add4x4_idct_mmxext( uint8_t *p_dst, int i_dst, int16_t dct[4][4] )
+{
+    /* Load dct coeffs: mm0..mm3 = dct[0]..dct[3] */
+    asm volatile(
+        "movq   (%0), %%mm0\n"
+        "movq  8(%0), %%mm1\n"
+        "movq 16(%0), %%mm2\n"
+        "movq 24(%0), %%mm3\n" :: "r"(dct) );
+
+    MMX_SUMSUB_BA  ( %%mm2, %%mm0 );                /* mm2=s02  mm0=d02 */
+    MMX_SUMSUBD2_AB( %%mm1, %%mm3, %%mm5, %%mm4 );  /* mm1=s13  mm4=d13 ( well 1 + 3>>1 and 1>>1 + 3) */
+
+    MMX_SUMSUB_BADC( %%mm1, %%mm2, %%mm4, %%mm0 );  /* mm1=s02+s13  mm2=s02-s13  mm4=d02+d13  mm0=d02-d13 */
+
+    /* in: mm1, mm4, mm0, mm2  out: mm1, mm2, mm3, mm0 */
+    MMX_TRANSPOSE  ( %%mm1, %%mm4, %%mm0, %%mm2, %%mm3 );
+
+    MMX_SUMSUB_BA  ( %%mm3, %%mm1 );                /* mm3=s02  mm1=d02 */
+    MMX_SUMSUBD2_AB( %%mm2, %%mm0, %%mm5, %%mm4 );  /* mm2=s13  mm4=d13 ( well 1 + 3>>1 and 1>>1 + 3) */
+
+    MMX_SUMSUB_BADC( %%mm2, %%mm3, %%mm4, %%mm1 );  /* mm2=s02+s13  mm3=s02-s13  mm4=d02+d13  mm1=d02-d13 */
+
+    /* in: mm2, mm4, mm1, mm3  out: mm2, mm3, mm0, mm1 */
+    MMX_TRANSPOSE  ( %%mm2, %%mm4, %%mm1, %%mm3, %%mm0 );
+
+    MMX_ZERO( %%mm7 );                              /* mm7 = 0, the unpack/pack partner in MMX_STORE_DIFF_4P */
+    asm volatile( "movq x264_mmx_32, %%mm6\n" :: ); /* mm6 = four words of 32: rounding term for the >>6 */
+
+    MMX_STORE_DIFF_4P( %%mm2, %%mm4, %%mm6, %%mm7, &p_dst[0*i_dst] );  /* row 0; mm4 is scratch from here on */
+    MMX_STORE_DIFF_4P( %%mm3, %%mm4, %%mm6, %%mm7, &p_dst[1*i_dst] );  /* row 1 */
+    MMX_STORE_DIFF_4P( %%mm0, %%mm4, %%mm6, %%mm7, &p_dst[2*i_dst] );  /* row 2 */
+    MMX_STORE_DIFF_4P( %%mm1, %%mm4, %%mm6, %%mm7, &p_dst[3*i_dst] );  /* row 3 */
+}
+#endif
+
+/* Run x264_add4x4_idct_mmxext on the four 4x4 quadrants of an 8x8 block, in raster order. */
+void x264_add8x8_idct_mmxext( uint8_t *p_dst, int i_dst, int16_t dct[4][4][4] )
+{
+    int i;
+    for( i = 0; i < 4; i++ )    /* bit 0 of i selects the column, bit 1 the row */
+        x264_add4x4_idct_mmxext( &p_dst[4*(i>>1)*i_dst + 4*(i&1)], i_dst, dct[i] );
+}
+
+/* Run x264_add8x8_idct_mmxext on the four 8x8 quadrants of a 16x16 block, in raster order; quadrant i reads dct[4*i .. 4*i+3]. */
+void x264_add16x16_idct_mmxext( uint8_t *p_dst, int i_dst, int16_t dct[16][4][4] )
+{
+    int i;
+    for( i = 0; i < 4; i++ )    /* bit 0 of i selects the column, bit 1 the row */
+        x264_add8x8_idct_mmxext( &p_dst[8*(i>>1)*i_dst + 8*(i&1)], i_dst, &dct[4*i] );
+}
diff --git a/core/i386/dct.asm b/core/i386/dct.asm
new file mode 100644
index 00000000..054daba7
--- /dev/null
+++ b/core/i386/dct.asm
@@ -0,0 +1,313 @@
+;*****************************************************************************
+;* dct.asm: h264 encoder library
+;*****************************************************************************
+;* Copyright (C) 2003 x264 project
+;* $Id: dct.asm,v 1.1 2004/06/03 19:27:07 fenrir Exp $
+;*
+;* Authors: Min Chen <chenm001.163.com> (converted to nasm)
+;*          Laurent Aimar <fenrir@via.ecp.fr> (initial version)
+;*
+;* This program is free software; you can redistribute it and/or modify
+;* it under the terms of the GNU General Public License as published by
+;* the Free Software Foundation; either version 2 of the License, or
+;* (at your option) any later version.
+;*
+;* This program is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;* GNU General Public License for more details.
+;*
+;* You should have received a copy of the GNU General Public License
+;* along with this program; if not, write to the Free Software
+;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+;*****************************************************************************
+
+;*****************************************************************************
+;*                                                                           *
+;*  Revision history:                                                        *
+;*                                                                           *
+;*  2004.04.28  ported all 4x4 functions to nasm (CM)                        *
+;*                                                                           *
+;*****************************************************************************
+
+BITS 32
+
+;=============================================================================
+; Macros and other preprocessor constants
+;=============================================================================
+
+%macro cglobal 1                ; export the symbol named by argument 1
+	%ifdef PREFIX               ; PREFIX defined: the exported symbol is the name with '_' prepended
+		global _%1
+		%define %1 _%1          ; the label written later under the plain name then assembles as the underscored one
+	%else
+		global %1
+	%endif
+%endmacro
+
+%macro MMX_ZERO 1               ; arg 1 = mm register to clear
+    pxor    %1, %1              ; x xor x = 0
+%endmacro
+
+%macro MMX_LOAD_DIFF_4P 5       ; args: 1=result reg  2=scratch reg  3=reg holding zero  4=[pix1]  5=[pix2]
+    movd        %1, %4          ; 4 bytes of pix1
+    punpcklbw   %1, %3          ; interleave with zero bytes: 4 unsigned 16-bit words
+    movd        %2, %5          ; 4 bytes of pix2
+    punpcklbw   %2, %3
+    psubw       %1, %2          ; result = pix1 - pix2, per word
+%endmacro
+
+%macro MMX_SUMSUB_BA 2          ; in: arg1=a arg2=b   out: arg1=a+b  arg2=b-a
+    paddw   %1, %2              ; a+b
+    paddw   %2, %2              ; 2b
+    psubw   %2, %1              ; 2b-(a+b) = b-a
+%endmacro
+
+%macro MMX_SUMSUB_BADC 4        ; MMX_SUMSUB_BA on (arg1,arg2) and on (arg3,arg4), interleaved
+    paddw   %1, %2              ; arg1 = a+b
+    paddw   %3, %4              ; arg3 = c+d
+    paddw   %2, %2
+    paddw   %4, %4
+    psubw   %2, %1              ; arg2 = b-a
+    psubw   %4, %3              ; arg4 = d-c
+%endmacro
+
+%macro MMX_SUMSUB2_AB 3         ; in: arg1=a arg2=b   out: arg1=2a+b  arg3=a-2b   (arg2 is only read)
+    movq    %3, %1              ; arg3 = a
+    paddw   %1, %1              ; 2a
+    paddw   %1, %2              ; 2a+b
+    psubw   %3, %2              ; a-b
+    psubw   %3, %2              ; a-2b
+%endmacro
+
+%macro MMX_SUMSUBD2_AB 4        ; in: arg1=a arg2=b   out: arg1=a+(b>>1)  arg4=(a>>1)-b   clobbers: arg2=b>>1, arg3=b
+    movq    %4, %1              ; arg4 = a
+    movq    %3, %2              ; arg3 = b (unshifted copy)
+    psraw   %2, 1               ; b>>1; plain decimal count, since a $-prefixed number is hexadecimal in NASM
+    psraw   %4, 1               ; a>>1
+    paddw   %1, %2              ; a + (b>>1)
+    psubw   %4, %3              ; (a>>1) - b
+%endmacro
+
+%macro SBUTTERFLYwd 3           ; in: arg1=a arg2=b (4 words each)   arg3 = output/scratch
+    movq        %3, %1
+    punpcklwd   %1, %2          ; arg1 = a0 b0 a1 b1  (lowest word first)
+    punpckhwd   %3, %2          ; arg3 = a2 b2 a3 b3
+%endmacro
+
+%macro SBUTTERFLYdq 3           ; in: arg1=a arg2=b (2 dwords each)  arg3 = output/scratch
+    movq        %3, %1
+    punpckldq   %1, %2          ; arg1 = low dword of a, low dword of b
+    punpckhdq   %3, %2          ; arg3 = high dword of a, high dword of b
+%endmacro
+
+;-----------------------------------------------------------------------------
+; 4x4 word transpose. in: rows A B C D (args 1-4), T = arg 5; out: rows in A D T C (B clobbered)
+;-----------------------------------------------------------------------------
+%macro MMX_TRANSPOSE 5
+    SBUTTERFLYwd %1, %2, %5     ; A = a0 b0 a1 b1   T = a2 b2 a3 b3
+    SBUTTERFLYwd %3, %4, %2     ; C = c0 d0 c1 d1   B = c2 d2 c3 d3
+    SBUTTERFLYdq %1, %3, %4     ; A = a0 b0 c0 d0   D = a1 b1 c1 d1
+    SBUTTERFLYdq %5, %2, %3     ; T = a2 b2 c2 d2   C = a3 b3 c3 d3
+%endmacro
+
+%macro MMX_STORE_DIFF_4P 5      ; args: 1=residual words  2=scratch  3=rounding words  4=reg holding zero  5=[dst]
+    paddw       %1, %3          ; add the rounding term
+    psraw       %1, 6           ; >>6; plain decimal count, since a $-prefixed number is hexadecimal in NASM
+    movd        %2, %5          ; 4 destination bytes
+    punpcklbw   %2, %4          ; widened to 4 words
+    paddsw      %1, %2          ; residual + destination, signed saturation
+    packuswb    %1, %1          ; back to bytes with unsigned saturation
+    movd        %5, %1          ; store 4 bytes
+%endmacro
+
+;%macro 
+;%endmacro
+
+;=============================================================================
+; Local Data (Read Only)
+;=============================================================================
+
+%ifdef FORMAT_COFF
+SECTION .rodata data
+%else
+SECTION .rodata data align=16
+%endif
+
+;-----------------------------------------------------------------------------
+; Rounding constants, one 16-bit word per MMX lane (loaded with movq)
+;-----------------------------------------------------------------------------
+
+ALIGN 16
+x264_mmx_1:
+  dw 1, 1, 1, 1                 ; added before the >>1 in x264_dct4x4dc_mmxext
+
+x264_mmx_32:
+  dw 32, 32, 32, 32             ; added before the >>6 in x264_add4x4_idct_mmxext
+
+;=============================================================================
+; Code
+;=============================================================================
+
+SECTION .text
+
+cglobal x264_dct4x4dc_mmxext
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+;   void __cdecl x264_dct4x4dc_mmxext( int16_t d[4][4] )
+;-----------------------------------------------------------------------------
+x264_dct4x4dc_mmxext:
+    mov     eax,        [esp+ 4]                    ; eax = d
+    movq    mm0,        [eax+ 0]                    ; mm0..mm3 = d[0]..d[3], four 16-bit words each
+    movq    mm1,        [eax+ 8]
+    movq    mm2,        [eax+16]
+    movq    mm3,        [eax+24]
+
+    MMX_SUMSUB_BADC     mm1, mm0, mm3, mm2          ; mm1=s01  mm0=d01  mm3=s23  mm2=d23
+    MMX_SUMSUB_BADC     mm3, mm1, mm2, mm0          ; mm3=s01+s23  mm1=s01-s23  mm2=d01+d23  mm0=d01-d23
+
+    MMX_TRANSPOSE       mm3, mm1, mm0, mm2, mm4     ; in: mm3, mm1, mm0, mm2  out: mm3, mm2, mm4, mm0 
+
+    MMX_SUMSUB_BADC     mm2, mm3, mm0, mm4          ; mm2=s01  mm3=d01  mm0=s23  mm4=d23
+    MMX_SUMSUB_BADC     mm0, mm2, mm4, mm3          ; mm0=s01+s23  mm2=s01-s23  mm4=d01+d23  mm3=d01-d23
+
+    MMX_TRANSPOSE       mm0, mm2, mm3, mm4, mm1     ; in: mm0, mm2, mm3, mm4  out: mm0, mm4, mm1, mm3
+
+    movq    mm6,        [x264_mmx_1]                ; mm6 = 1,1,1,1
+    paddw   mm0,        mm6                         ; each word x is stored as (x + 1) >> 1
+    paddw   mm4,        mm6
+    psraw   mm0,        1
+    movq    [eax+ 0],   mm0                         ; results overwrite d in place
+    psraw   mm4,        1
+    movq    [eax+ 8],   mm4
+    paddw   mm1,        mm6
+    paddw   mm3,        mm6
+    psraw   mm1,        1
+    movq    [eax+16],   mm1
+    psraw   mm3,        1
+    movq    [eax+24],   mm3
+    ret                                             ; no emms is executed here
+
+cglobal x264_idct4x4dc_mmxext
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+;   void __cdecl x264_idct4x4dc_mmxext( int16_t d[4][4] )
+;-----------------------------------------------------------------------------
+x264_idct4x4dc_mmxext:
+    mov     eax, [esp+ 4]                           ; eax = d
+    movq    mm0, [eax+ 0]                           ; mm0..mm3 = d[0]..d[3], four 16-bit words each
+    movq    mm1, [eax+ 8]
+    movq    mm2, [eax+16]
+    movq    mm3, [eax+24]
+
+    MMX_SUMSUB_BADC     mm1, mm0, mm3, mm2          ; mm1=s01  mm0=d01  mm3=s23  mm2=d23
+    MMX_SUMSUB_BADC     mm3, mm1, mm2, mm0          ; mm3=s01+s23 mm1=s01-s23 mm2=d01+d23 mm0=d01-d23
+
+    MMX_TRANSPOSE       mm3, mm1, mm0, mm2, mm4     ; in: mm3, mm1, mm0, mm2  out: mm3, mm2, mm4, mm0 
+
+    MMX_SUMSUB_BADC     mm2, mm3, mm0, mm4          ; mm2=s01  mm3=d01  mm0=s23  mm4=d23
+    MMX_SUMSUB_BADC     mm0, mm2, mm4, mm3          ; mm0=s01+s23  mm2=s01-s23  mm4=d01+d23  mm3=d01-d23
+
+    MMX_TRANSPOSE       mm0, mm2, mm3, mm4, mm1     ; in: mm0, mm2, mm3, mm4  out: mm0, mm4, mm1, mm3
+
+    movq    [eax+ 0],   mm0                         ; results overwrite d in place, with no rounding/shift step
+    movq    [eax+ 8],   mm4
+    movq    [eax+16],   mm1
+    movq    [eax+24],   mm3
+    ret                                             ; no emms is executed here
+
+cglobal x264_sub4x4_dct_mmxext
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+;   void __cdecl x264_sub4x4_dct_mmxext( int16_t dct[4][4], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
+;-----------------------------------------------------------------------------
+x264_sub4x4_dct_mmxext:
+    push    ebx             ; ebx is used for i_pix1; the push shifts every argument offset by 4
+    mov     eax, [esp+12]   ; pix1
+    mov     ebx, [esp+16]   ; i_pix1
+    mov     ecx, [esp+20]   ; pix2
+    mov     edx, [esp+24]   ; i_pix2
+
+    MMX_ZERO    mm7         ; zero register for the byte->word unpacking below
+
+    ; Load 4 lines: mmN = row N of pix1 minus row N of pix2 (mm6 is scratch)
+    MMX_LOAD_DIFF_4P    mm0, mm6, mm7, [eax      ], [ecx]
+    MMX_LOAD_DIFF_4P    mm1, mm6, mm7, [eax+ebx  ], [ecx+edx]
+    MMX_LOAD_DIFF_4P    mm2, mm6, mm7, [eax+ebx*2], [ecx+edx*2]
+    add     eax, ebx        ; step both bases down one row so that base+2*stride is row 3
+    add     ecx, edx
+    MMX_LOAD_DIFF_4P    mm3, mm6, mm7, [eax+ebx*2], [ecx+edx*2]
+
+    MMX_SUMSUB_BADC     mm3, mm0, mm2, mm1          ; mm3=s03  mm0=d03  mm2=s12  mm1=d12
+
+    MMX_SUMSUB_BA       mm2, mm3                    ; mm2=s03+s12      mm3=s03-s12
+    MMX_SUMSUB2_AB      mm0, mm1, mm4               ; mm0=2.d03+d12    mm4=d03-2.d12
+
+    ; transpose in: mm2, mm0, mm3, mm4, out: mm2, mm4, mm1, mm3
+    MMX_TRANSPOSE       mm2, mm0, mm3, mm4, mm1
+
+    MMX_SUMSUB_BADC     mm3, mm2, mm1, mm4          ; mm3=s03  mm2=d03  mm1=s12  mm4=d12
+
+    MMX_SUMSUB_BA       mm1, mm3                    ; mm1=s03+s12      mm3=s03-s12
+    MMX_SUMSUB2_AB      mm2, mm4, mm0               ; mm2=2.d03+d12    mm0=d03-2.d12
+
+    ; transpose in: mm1, mm2, mm3, mm0, out: mm1, mm0, mm4, mm3
+    MMX_TRANSPOSE       mm1, mm2, mm3, mm0, mm4
+
+    mov     eax, [esp+ 8]   ; dct
+    movq    [eax+ 0],   mm1 ; the four result rows go to dct[0]..dct[3]
+    movq    [eax+ 8],   mm0
+    movq    [eax+16],   mm4
+    movq    [eax+24],   mm3
+
+    pop     ebx
+    ret                     ; no emms is executed here
+
+cglobal x264_add4x4_idct_mmxext
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+;   void __cdecl x264_add4x4_idct_mmxext( uint8_t *p_dst, int i_dst, int16_t dct[4][4] )
+;-----------------------------------------------------------------------------
+x264_add4x4_idct_mmxext:
+
+    ; Load dct coeffs: mm0..mm3 = dct[0]..dct[3]
+    mov     eax, [esp+12]   ; dct
+    movq    mm0, [eax+ 0]
+    movq    mm1, [eax+ 8]
+    movq    mm2, [eax+16]
+    movq    mm3, [eax+24]
+    
+    mov     eax, [esp+ 4]   ; p_dst
+    mov     ecx, [esp+ 8]   ; i_dst
+    lea     edx, [ecx+ecx*2]                            ; edx = 3*i_dst, offset of row 3
+
+    MMX_SUMSUB_BA       mm2, mm0                        ; mm2=s02  mm0=d02
+    MMX_SUMSUBD2_AB     mm1, mm3, mm5, mm4              ; mm1=s13  mm4=d13 ( well 1 + 3>>1 and 1>>1 + 3)
+
+    MMX_SUMSUB_BADC     mm1, mm2, mm4, mm0              ; mm1=s02+s13  mm2=s02-s13  mm4=d02+d13  mm0=d02-d13
+
+    ; in: mm1, mm4, mm0, mm2  out: mm1, mm2, mm3, mm0
+    MMX_TRANSPOSE       mm1, mm4, mm0, mm2, mm3
+
+    MMX_SUMSUB_BA       mm3, mm1                        ; mm3=s02  mm1=d02
+    MMX_SUMSUBD2_AB     mm2, mm0, mm5, mm4              ; mm2=s13  mm4=d13 ( well 1 + 3>>1 and 1>>1 + 3)
+
+    MMX_SUMSUB_BADC     mm2, mm3, mm4, mm1              ; mm2=s02+s13  mm3=s02-s13  mm4=d02+d13  mm1=d02-d13
+
+    ; in: mm2, mm4, mm1, mm3  out: mm2, mm3, mm0, mm1
+    MMX_TRANSPOSE       mm2, mm4, mm1, mm3, mm0
+
+    MMX_ZERO            mm7                             ; zero register for unpacking the destination bytes
+    movq                mm6, [x264_mmx_32]              ; rounding term for the >>6 inside MMX_STORE_DIFF_4P
+    
+    MMX_STORE_DIFF_4P   mm2, mm4, mm6, mm7, [eax]       ; row 0; mm4 is scratch from here on
+    MMX_STORE_DIFF_4P   mm3, mm4, mm6, mm7, [eax+ecx]   ; row 1
+    MMX_STORE_DIFF_4P   mm0, mm4, mm6, mm7, [eax+ecx*2] ; row 2
+    MMX_STORE_DIFF_4P   mm1, mm4, mm6, mm7, [eax+edx]   ; row 3
+
+    ret                                                 ; no emms is executed here
+
diff --git a/core/i386/dct.h b/core/i386/dct.h
new file mode 100644
index 00000000..23601e5e
--- /dev/null
+++ b/core/i386/dct.h
@@ -0,0 +1,38 @@
+/*****************************************************************************
+ * dct.h: h264 encoder library
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: dct.h,v 1.1 2004/06/03 19:27:07 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#ifndef _I386_DCT_H
+#define _I386_DCT_H 1
+#include <stdint.h>     /* int16_t, uint8_t: the header no longer depends on what the includer pulled in first */
+/* dct receives the transform of the difference pix1 - pix2; i_pix1 / i_pix2: byte distance between rows */
+void x264_sub4x4_dct_mmxext( int16_t dct[4][4],  uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 );
+void x264_sub8x8_dct_mmxext( int16_t dct[4][4][4],  uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 );
+void x264_sub16x16_dct_mmxext( int16_t dct[16][4][4],  uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 );
+/* the transformed dct is added to p_dst and stored back with unsigned 8-bit saturation; i_dst: byte distance between rows */
+void x264_add4x4_idct_mmxext( uint8_t *p_dst, int i_dst, int16_t dct[4][4] );
+void x264_add8x8_idct_mmxext( uint8_t *p_dst, int i_dst, int16_t dct[4][4][4] );
+void x264_add16x16_idct_mmxext( uint8_t *p_dst, int i_dst, int16_t dct[16][4][4] );
+/* 4x4 dc transforms; d is transformed in place */
+void x264_dct4x4dc_mmxext( int16_t d[4][4] );
+void x264_idct4x4dc_mmxext( int16_t d[4][4] );
+#endif /* _I386_DCT_H */
diff --git a/core/i386/mc-c.c b/core/i386/mc-c.c
new file mode 100644
index 00000000..aa7363b1
--- /dev/null
+++ b/core/i386/mc-c.c
@@ -0,0 +1,940 @@
+/*****************************************************************************
+ * mc-c.c: h264 encoder library (Motion Compensation)
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: mc-c.c,v 1.1 2004/06/03 19:27:07 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdint.h>
+
+#include "../mc.h"
+#include "../clip1.h"
+#include "mc.h"
+
+/* 64-bit constant that is only referenced by name from inline asm; 'used' stops the compiler discarding it */
+#define UNUSED_UINT64( foo ) \
+    static const uint64_t foo __asm__ (#foo)  __attribute__((used))
+UNUSED_UINT64( x264_w0x10 ) = 0x0010001000100010ULL;
+
+
+#define MMX_ZERO( MMZ ) /* MMZ ^= MMZ, i.e. clear the register */ \
+    asm volatile( "pxor " #MMZ ", " #MMZ "\n" :: )
+/* load the 8 bytes of the symbol NAME into MMV; NAME is pasted into the asm text, so it must be an assembler-visible name */
+#define MMX_INIT( MMV, NAME ) \
+    asm volatile( "movq " #NAME ", " #MMV "\n" :: )
+
+#define MMX_SAVE_4P( MMP, MMZ, dst ) /* pack MMP's words to saturated bytes, store the low 4 at dst (MMP is modified) */ \
+    asm volatile( "packuswb " #MMZ  "," #MMP "\n" \
+                  "movd " #MMP ", (%0)" :: "r"(dst) : "memory" )
+/* load 4 bytes at pix into MMP and interleave them with MMZ (callers pass a zeroed register) to get 4 words */
+#define MMX_LOAD_4P( MMP, MMZ, pix ) \
+    asm volatile( "movd (%0), " #MMP "\n" \
+                  "punpcklbw  " #MMZ ", " #MMP "\n" : : "r"(pix) : "memory" )
+
+#define MMX_LOAD_4x4( MMP1, MMP2, MMP3, MMP4, MMZ, pix, i_pix )\
+    MMX_LOAD_4P( MMP1, MMZ, &(pix)[0*(i_pix)] ); \
+    MMX_LOAD_4P( MMP2, MMZ, &(pix)[1*(i_pix)] ); \
+    MMX_LOAD_4P( MMP3, MMZ, &(pix)[2*(i_pix)] ); \
+    MMX_LOAD_4P( MMP4, MMZ, &(pix)[3*(i_pix)] )
+
+#define MMX_LOAD_2x4( MMP1, MMP2, MMZ, pix, i_pix )\
+    MMX_LOAD_4P( MMP1, MMZ, &(pix)[0*(i_pix)] ); \
+    MMX_LOAD_4P( MMP2, MMZ, &(pix)[1*(i_pix)] )
+/* pack the words of MMP1 (low half) and MMP2 (high half) to 8 saturated bytes and store them at dst; MMZ is unused */
+#define MMX_SAVEPACK_8P( MMP1, MMP2, MMZ, dst ) \
+    asm volatile( "packuswb " #MMP2  "," #MMP1 "\n" \
+                  "movq " #MMP1 ", (%0)\n" :: "r"(dst) : "memory" )
+
+/* load 8 bytes at pix: low 4 widened into MMP1, high 4 into MMP2; "memory" tells the compiler the asm touches *pix */
+#define MMX_LOAD_8P( MMP1, MMP2, MMZ, pix ) \
+    asm volatile( "movq         (%0)   , " #MMP1 "\n" \
+                  "movq       " #MMP1 ", " #MMP2 "\n" \
+                  "punpcklbw  " #MMZ  ", " #MMP1 "\n" \
+                  "punpckhbw  " #MMZ  ", " #MMP2 "\n" : : "r"(pix) : "memory" )
+
+#define MMX_LOAD_2x8( MMP1, MMP2, MMP3, MMP4, MMZ, pix, i_pix )\
+    MMX_LOAD_8P( MMP1, MMP2, MMZ, &(pix)[0*(i_pix)] ); \
+    MMX_LOAD_8P( MMP3, MMP4, MMZ, &(pix)[1*(i_pix)] )
+
+#define SBUTTERFLYwd(a,b,t ) /* t = high words of a,b interleaved; a = low words of a,b interleaved */ \
+    asm volatile( "movq " #a ", " #t "        \n\t" \
+                  "punpcklwd " #b ", " #a "   \n\t" \
+                  "punpckhwd " #b ", " #t "   \n\t" :: )
+/* same as SBUTTERFLYwd but interleaving 32-bit halves */
+#define SBUTTERFLYdq(a,b,t )\
+    asm volatile( "movq " #a ", " #t "        \n\t" \
+                  "punpckldq " #b ", " #a "   \n\t" \
+                  "punpckhdq " #b ", " #t "   \n\t" :: )
+
+/* 4x4 transpose of 16-bit words: rows in MMA,MMB,MMC,MMD -> result rows 0..3 in MMA,MMD,MMT,MMC (MMB is overwritten) */
+#define MMX_TRANSPOSE( MMA, MMB, MMC, MMD, MMT ) \
+        SBUTTERFLYwd( MMA, MMB, MMT ); \
+        SBUTTERFLYwd( MMC, MMD, MMB ); \
+        SBUTTERFLYdq( MMA, MMC, MMD ); \
+        SBUTTERFLYdq( MMT, MMB, MMC )
+
+/* first pass MM0 = MM0 -5*MM1 (MMP1 is left multiplied by 4) */
+#define MMX_FILTERTAP_P1( MMP0, MMP1 ) \
+    asm volatile( "psubw    " #MMP1 "," #MMP0 "\n" \
+                  "psllw      $2,     " #MMP1 "\n" \
+                  "psubw    " #MMP1 "," #MMP0 "\n" :: )
+                                                   \
+/* second pass MM0 = MM0 + 20*(MM2+MM3), built as 4*s + 16*s; MMP2 is overwritten */
+#define MMX_FILTERTAP_P2( MMP0, MMP2, MMP3 ) \
+    asm volatile( "paddw    " #MMP3 "," #MMP2 "\n" \
+                                                 \
+                  "psllw      $2,     " #MMP2 "\n" \
+                  "paddw    " #MMP2 "," #MMP0 "\n" \
+                  "psllw      $2,     " #MMP2 "\n" \
+                  "paddw    " #MMP2 "," #MMP0 "\n" :: )
+
+/* last pass: MM0 = ( MM0 -5*MM1 + MM2 + MMV ) >> 5 (arithmetic shift); MMZ is not used */
+#define MMX_FILTERTAP_P3( MMP0, MMP1, MMP2, MMV, MMZ ) \
+    asm volatile( "psubw    " #MMP1 "," #MMP0 "\n" \
+                  "psllw      $2,     " #MMP1 "\n" \
+                  "psubw    " #MMP1 "," #MMP0 "\n" \
+                                                   \
+                  "paddw    " #MMP2 "," #MMP0 "\n" \
+                  "paddw    " #MMV  "," #MMP0 "\n" \
+                  "psraw      $5,     " #MMP0 "\n" :: )
+/* first pass on two register pairs at once: MMP0 -= 5*MMP1 and MMP2 -= 5*MMP3 */
+#define MMX_FILTERTAP2_P1( MMP0, MMP1, MMP2, MMP3 ) \
+    asm volatile( "psubw    " #MMP1 "," #MMP0 "\n" \
+                  "psubw    " #MMP3 "," #MMP2 "\n" \
+                  "psllw      $2,     " #MMP1 "\n" \
+                  "psllw      $2,     " #MMP3 "\n" \
+                  "psubw    " #MMP1 "," #MMP0 "\n" \
+                  "psubw    " #MMP3 "," #MMP2 "\n" :: )
+
+/* second pass on two lanes: MM0 = MM0 + 20*(MM1+MM2) and MM3 = MM3 + 20*(MM4+MM5) */
+#define MMX_FILTERTAP2_P2( MMP0, MMP1, MMP2, MMP3, MMP4, MMP5 ) \
+    asm volatile( "paddw    " #MMP2 "," #MMP1 "\n" \
+                  "paddw    " #MMP5 "," #MMP4 "\n" \
+                                                 \
+                  "psllw      $2,     " #MMP1 "\n" \
+                  "psllw      $2,     " #MMP4 "\n" \
+                  "paddw    " #MMP1 "," #MMP0 "\n" \
+                  "paddw    " #MMP4 "," #MMP3 "\n" \
+                  "psllw      $2,     " #MMP1 "\n" \
+                  "psllw      $2,     " #MMP4 "\n" \
+                  "paddw    " #MMP1 "," #MMP0 "\n" \
+                  "paddw    " #MMP4 "," #MMP3 "\n" :: )
+
+#define MMX_LOAD_1r( m1, dst ) /* movq: 8 bytes at dst -> m1 */ \
+    asm volatile( "movq (%0), " #m1 "\n" :: "r"(dst) : "memory" )
+
+#define MMX_SAVE_1r( m1, dst ) /* movq: m1 -> 8 bytes at dst */ \
+    asm volatile( "movq " #m1 ", (%0)\n" :: "r"(dst) : "memory" )
+/* the _2r/_4r forms move consecutive rows i_dst bytes apart; dst is indexed as a byte pointer */
+#define MMX_LOAD_2r( m1, m2, dst, i_dst ) \
+    asm volatile( "movq (%0), " #m1 "\n" :: "r"(&((uint8_t*)(dst))[0*(i_dst)]) : "memory" ); \
+    asm volatile( "movq (%0), " #m2 "\n" :: "r"(&((uint8_t*)(dst))[1*(i_dst)]) : "memory" )
+
+#define MMX_SAVE_2r( m1, m2, dst, i_dst ) \
+    asm volatile( "movq " #m1 ", (%0)\n" :: "r"(&((uint8_t*)(dst))[0*(i_dst)]) : "memory" ); \
+    asm volatile( "movq " #m2 ", (%0)\n" :: "r"(&((uint8_t*)(dst))[1*(i_dst)]) : "memory" )
+/* "memory" clobbers: these statements access *dst although only the address is passed as an operand */
+#define MMX_SAVE_4r( m1, m2, m3, m4, dst, i_dst ) \
+    asm volatile( "movq " #m1 ", (%0)\n" :: "r"(&((uint8_t*)(dst))[0*(i_dst)]) : "memory" ); \
+    asm volatile( "movq " #m2 ", (%0)\n" :: "r"(&((uint8_t*)(dst))[1*(i_dst)]) : "memory" ); \
+    asm volatile( "movq " #m3 ", (%0)\n" :: "r"(&((uint8_t*)(dst))[2*(i_dst)]) : "memory" ); \
+    asm volatile( "movq " #m4 ", (%0)\n" :: "r"(&((uint8_t*)(dst))[3*(i_dst)]) : "memory" )
+
+#define MMX_LOAD_4r( m1, m2, m3, m4, dst, i_dst ) \
+    asm volatile( "movq (%0), " #m1 "\n" :: "r"(&((uint8_t*)(dst))[0*(i_dst)]) : "memory" ); \
+    asm volatile( "movq (%0), " #m2 "\n" :: "r"(&((uint8_t*)(dst))[1*(i_dst)]) : "memory" ); \
+    asm volatile( "movq (%0), " #m3 "\n" :: "r"(&((uint8_t*)(dst))[2*(i_dst)]) : "memory" ); \
+    asm volatile( "movq (%0), " #m4 "\n" :: "r"(&((uint8_t*)(dst))[3*(i_dst)]) : "memory" )
+
+
+static inline int x264_tapfilter( uint8_t *pix, int i_pix_next )
+{   /* Unnormalised 6-tap (1,-5,20,20,-5,1) sum over pix[-2..3], samples spaced i_pix_next bytes apart. */
+    return ( pix[-2*i_pix_next] + pix[3*i_pix_next] ) - 5*( pix[-i_pix_next] + pix[2*i_pix_next] ) + 20*( pix[0] + pix[i_pix_next] );
+}
+static inline int x264_tapfilter1( uint8_t *pix )
+{   /* Unnormalised 6-tap (1,-5,20,20,-5,1) sum over the six adjacent bytes pix[-2..3]. */
+    return ( pix[-2] + pix[3] ) - 5*( pix[-1] + pix[2] ) + 20*( pix[0] + pix[1] );
+}
+
+static inline void pixel_avg_w4( uint8_t *dst,  int i_dst_stride,
+                                 uint8_t *src1, int i_src1_stride,
+                                 uint8_t *src2, int i_src2_stride,
+                                 int i_height )
+{   /* dst = (src1 + src2 + 1) >> 1 for a block 4 bytes wide and i_height rows high (plain C) */
+    int i_row, i_col;
+    for( i_row = i_height; i_row > 0; i_row-- )
+    {
+        for( i_col = 0; i_col != 4; ++i_col )
+        {
+            dst[i_col] = ( src1[i_col] + src2[i_col] + 1 ) >> 1;
+        }
+        dst  += i_dst_stride;
+        src1 += i_src1_stride;
+        src2 += i_src2_stride;
+    }
+}
+static inline void pixel_avg_w8( uint8_t *dst,  int i_dst_stride,
+                                 uint8_t *src1, int i_src1_stride,
+                                 uint8_t *src2, int i_src2_stride,
+                                 int i_height )
+{
+    int y;   /* one 8-byte pavgb (bytewise (a+b+1)>>1) per row; mm0/mm1 are overwritten */
+    for( y = 0; y < i_height; y++ )
+    {
+        __asm__ __volatile__(
+            "movq (%1), %%mm0\n"
+            "movq (%2), %%mm1\n"
+            "pavgb %%mm1, %%mm0\n"
+            "movq %%mm0, (%0)\n"
+            : : "r"(dst), "r"(src1), "r"(src2)
+            : "memory" );   /* the asm reads *src1/*src2 and writes *dst; only addresses are operands */
+        dst  += i_dst_stride;
+        src1 += i_src1_stride;
+        src2 += i_src2_stride;
+    }
+}
+static inline void pixel_avg_w16( uint8_t *dst,  int i_dst_stride,
+                                  uint8_t *src1, int i_src1_stride,
+                                  uint8_t *src2, int i_src2_stride,
+                                  int i_height )
+{
+    int y;
+    /* per row: two 8-byte pavgb operations (bytewise (a+b+1)>>1); mm0..mm3 are overwritten */
+    for( y = 0; y < i_height; y++ )
+    {
+        __asm__ __volatile__(
+            "movq (%1), %%mm0\n"
+            "movq 8(%1), %%mm2\n"
+            "movq (%2), %%mm1\n"
+            "movq 8(%2), %%mm3\n"
+
+            "pavgb %%mm1, %%mm0\n"
+            "movq %%mm0, (%0)\n"
+            "pavgb %%mm3, %%mm2\n"
+            "movq %%mm2, 8(%0)\n"
+            : : "r"(dst), "r"(src1), "r"(src2)
+            : "memory" );   /* the asm reads *src1/*src2 and writes *dst; only addresses are operands */
+        dst  += i_dst_stride;
+        src1 += i_src1_stride;
+        src2 += i_src2_stride;
+    }
+}
+
+typedef void (*pf_mc_t)(uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ); /* signature shared by every fixed-width routine stored in the pf_mc dispatch table */
+
+/*****************************************************************************
+ * MC with width == 4 (height <= 8)
+ *****************************************************************************/
+#if 0 /* plain C version, compiled out */
+static void mc_copy_w4( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
+{
+    int y;
+
+    for( y = 0; y < i_height; y++ )
+    {
+        memcpy( dst, src, 4 );
+
+        src += i_src_stride;
+        dst += i_dst_stride;
+    }
+}
+#else /* NOTE(review): defined outside this file -- the mc.asm header mentions mc_copy_w4/8/16; confirm */
+extern void mc_copy_w4( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height );
+#endif
+
+static inline void mc_hh_w4( uint8_t *src, int i_src, uint8_t *dst, int i_dst, int i_height )
+{   /* Horizontal 6-tap half-pel filter for a 4-column block, four rows per iteration. */
+    const int h4 = i_height / 4;    /* rows beyond a multiple of 4 are not processed */
+    uint8_t  srct[4*8*3];           /* transposed source: one 8-byte entry (4 words) per source column */
+    uint64_t tmp[4];                /* filtered output columns, still transposed */
+    int y;
+
+    src -= 2;                       /* the filter window starts two pixels to the left */
+
+    MMX_ZERO( %%mm7 );              /* mm7 = 0, used to widen bytes to words */
+    MMX_INIT( %%mm6, x264_w0x10 );  /* mm6 = 16 in each word: rounding term for the >> 5 */
+
+    for( y = 0; y < h4; y++ )
+    {
+        int i;
+
+        /* Preload data and transpose them */
+        MMX_LOAD_4x4 ( %%mm0, %%mm4, %%mm3, %%mm1, %%mm7, &src[0], i_src );
+        MMX_TRANSPOSE( %%mm0, %%mm4, %%mm3, %%mm1, %%mm2 ); /* 0123 */
+        MMX_SAVE_4r( %%mm0, %%mm1, %%mm2, %%mm3, &srct[4*8*0], 8 );
+
+        MMX_LOAD_4x4 ( %%mm0, %%mm4, %%mm3, %%mm1, %%mm7, &src[4], i_src );
+        MMX_TRANSPOSE( %%mm0, %%mm4, %%mm3, %%mm1, %%mm2 ); /* 0123 */
+        MMX_SAVE_4r( %%mm0, %%mm1, %%mm2, %%mm3, &srct[4*8*1], 8 );
+
+        /* we read 2 more bytes than needed */
+        MMX_LOAD_4x4 ( %%mm0, %%mm4, %%mm3, %%mm1, %%mm7, &src[8], i_src );
+        MMX_TRANSPOSE( %%mm0, %%mm4, %%mm3, %%mm1, %%mm2 ); /* 0123 */
+        MMX_SAVE_2r( %%mm0, %%mm1, &srct[4*8*2], 8 );       /* only columns 8 and 9 are kept */
+
+        /* tap filter */
+        for( i = 0; i < 4; i++ )
+        {
+            MMX_LOAD_4r( %%mm0, %%mm1, %%mm2, %%mm3, &srct[8*(i+0)], 8 );   /* columns i .. i+3 */
+            MMX_FILTERTAP_P1( %%mm0, %%mm1 );
+            MMX_FILTERTAP_P2( %%mm0, %%mm2, %%mm3 );
+
+            MMX_LOAD_2r( %%mm1, %%mm2, &srct[8*(i+4)], 8 );                 /* columns i+4 and i+5 */
+            MMX_FILTERTAP_P3( %%mm0, %%mm1, %%mm2, %%mm6, %%mm7 );
+
+            MMX_SAVE_1r( %%mm0, &tmp[i] );
+        }
+        /* transpose the four result columns back into rows, then pack and store 4 bytes per row */
+        MMX_LOAD_4r( %%mm0, %%mm4, %%mm3, %%mm1, tmp, 8 );
+        MMX_TRANSPOSE( %%mm0, %%mm4, %%mm3, %%mm1, %%mm2 ); /* 0123 */
+        MMX_SAVE_4P( %%mm0, %%mm7, &dst[0*i_dst] );
+        MMX_SAVE_4P( %%mm1, %%mm7, &dst[1*i_dst] );
+        MMX_SAVE_4P( %%mm2, %%mm7, &dst[2*i_dst] );
+        MMX_SAVE_4P( %%mm3, %%mm7, &dst[3*i_dst] );
+
+        src += 4 * i_src;
+        dst += 4 * i_dst;
+    }
+}
+static inline void mc_hv_w4( uint8_t *src, int i_src, uint8_t *dst, int i_dst, int i_height )
+{   /* Vertical 6-tap half-pel filter for a 4-column block, one output row per iteration. */
+    int y;
+
+    src -= 2 * i_src;               /* the filter window starts two rows above */
+
+    MMX_ZERO( %%mm7 );              /* mm7 = 0, used to widen bytes to words */
+    MMX_INIT( %%mm6, x264_w0x10 );  /* mm6 = 16 in each word: rounding term for the >> 5 */
+
+    for( y = 0; y < i_height; y++ )
+    {
+        MMX_LOAD_4x4( %%mm0, %%mm1, %%mm2, %%mm3, %%mm7, src, i_src );   /* window rows 0..3 */
+        MMX_FILTERTAP_P1( %%mm0, %%mm1 );
+        MMX_FILTERTAP_P2( %%mm0, %%mm2, %%mm3 );
+
+        MMX_LOAD_2x4( %%mm4, %%mm5, %%mm7, &src[4*i_src], i_src );       /* window rows 4 and 5 */
+        MMX_FILTERTAP_P3( %%mm0, %%mm4, %%mm5, %%mm6, %%mm7 );
+        MMX_SAVE_4P( %%mm0, %%mm7, dst );
+
+        src += i_src;
+        dst += i_dst;
+    }
+}
+
+static inline void mc_hc_w4( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
+{   /* centre half-pel: vertical 6-tap into 9 intermediates, then horizontal 6-tap, + 512 and >> 10, clipped */
+    int i_row;
+
+    for( i_row = i_height; i_row > 0; i_row-- )
+    {
+        int16_t vsum[5+4];
+        int k;
+
+        for( k = 0; k < 5+4; k++ )
+        {
+            vsum[k] = x264_tapfilter( src + k - 2, i_src_stride );
+        }
+        for( k = 0; k < 4; k++ )
+        {
+            const int16_t *t = &vsum[k];
+            dst[k] = x264_mc_clip1( ( t[0] - 5*t[1] + 20 * t[2] + 20 * t[3] -5*t[4] + t[5] + 512 ) >> 10 );
+        }
+        src += i_src_stride;
+        dst += i_dst_stride;
+    }
+}
+
+/* mc I+H */
+static void mc_xy10_w4( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
+{   /* (dqx,dqy) = (1,0): rounded mean of the integer samples and the horizontal half-pel plane */
+    uint8_t horz[4*8];
+    mc_hh_w4( src, i_src_stride, horz, 4, i_height );
+    pixel_avg_w4( dst, i_dst_stride, src, i_src_stride, horz, 4, i_height );
+}
+static void mc_xy30_w4( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
+{   /* (dqx,dqy) = (3,0): rounded mean of the integer samples one column right and the horizontal half-pel plane */
+    uint8_t horz[4*8];
+    mc_hh_w4( src, i_src_stride, horz, 4, i_height );
+    pixel_avg_w4( dst, i_dst_stride, src+1, i_src_stride, horz, 4, i_height );
+}
+/* mc I+V */
+static void mc_xy01_w4( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
+{   /* (dqx,dqy) = (0,1): rounded mean of the integer samples and the vertical half-pel plane */
+    uint8_t vert[4*8];
+    mc_hv_w4( src, i_src_stride, vert, 4, i_height );
+    pixel_avg_w4( dst, i_dst_stride, src, i_src_stride, vert, 4, i_height );
+}
+static void mc_xy03_w4( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
+{   /* (dqx,dqy) = (0,3): rounded mean of the integer samples one row down and the vertical half-pel plane */
+    uint8_t vert[4*8];
+    mc_hv_w4( src, i_src_stride, vert, 4, i_height );
+    pixel_avg_w4( dst, i_dst_stride, src+i_src_stride, i_src_stride, vert, 4, i_height );
+}
+/* H+V */
+static void mc_xy11_w4( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
+{
+    /* (dqx,dqy) = (1,1): mean of the vertical and the horizontal half-pel planes, both taken at src */
+    uint8_t vert[4*8], horz[4*8];
+
+    mc_hv_w4( src, i_src_stride, vert, 4, i_height );
+    mc_hh_w4( src, i_src_stride, horz, 4, i_height );
+    pixel_avg_w4( dst, i_dst_stride, vert, 4, horz, 4, i_height );
+}
+static void mc_xy31_w4( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
+{
+    /* (dqx,dqy) = (3,1): mean of the vertical half-pel plane one column right and the horizontal one at src */
+    uint8_t vert[4*8], horz[4*8];
+
+    mc_hv_w4( src+1, i_src_stride, vert, 4, i_height );
+    mc_hh_w4( src,   i_src_stride, horz, 4, i_height );
+    pixel_avg_w4( dst, i_dst_stride, vert, 4, horz, 4, i_height );
+}
+static void mc_xy13_w4( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
+{
+    /* (dqx,dqy) = (1,3): mean of the vertical half-pel plane at src and the horizontal one a row below */
+    uint8_t vert[4*8], horz[4*8];
+
+    mc_hv_w4( src,              i_src_stride, vert, 4, i_height );
+    mc_hh_w4( src+i_src_stride, i_src_stride, horz, 4, i_height );
+    pixel_avg_w4( dst, i_dst_stride, vert, 4, horz, 4, i_height );
+}
+static void mc_xy33_w4( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
+{
+    /* (dqx,dqy) = (3,3): mean of the vertical half-pel plane one column right and the horizontal one a row below */
+    uint8_t vert[4*8], horz[4*8];
+
+    mc_hv_w4( src+1,            i_src_stride, vert, 4, i_height );
+    mc_hh_w4( src+i_src_stride, i_src_stride, horz, 4, i_height );
+    pixel_avg_w4( dst, i_dst_stride, vert, 4, horz, 4, i_height );
+}
+static void mc_xy21_w4( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
+{
+    /* (dqx,dqy) = (2,1): mean of the centre half-pel plane and the horizontal half-pel plane at src */
+    uint8_t cent[4*8], horz[4*8];
+
+    mc_hc_w4( src, i_src_stride, cent, 4, i_height );
+    mc_hh_w4( src, i_src_stride, horz, 4, i_height );
+    pixel_avg_w4( dst, i_dst_stride, cent, 4, horz, 4, i_height );
+}
+static void mc_xy12_w4( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
+{
+    /* (dqx,dqy) = (1,2): mean of the centre half-pel plane and the vertical half-pel plane at src */
+    uint8_t cent[4*8], vert[4*8];
+
+    mc_hc_w4( src, i_src_stride, cent, 4, i_height );
+    mc_hv_w4( src, i_src_stride, vert, 4, i_height );
+    pixel_avg_w4( dst, i_dst_stride, cent, 4, vert, 4, i_height );
+}
+static void mc_xy32_w4( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
+{
+    /* (dqx,dqy) = (3,2): mean of the centre half-pel plane and the vertical half-pel plane one column right */
+    uint8_t cent[4*8], vert[4*8];
+
+    mc_hc_w4( src,   i_src_stride, cent, 4, i_height );
+    mc_hv_w4( src+1, i_src_stride, vert, 4, i_height );
+    pixel_avg_w4( dst, i_dst_stride, cent, 4, vert, 4, i_height );
+}
+static void mc_xy23_w4( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
+{
+    /* (dqx,dqy) = (2,3): mean of the centre half-pel plane and the horizontal half-pel plane a row below */
+    uint8_t cent[4*8], horz[4*8];
+
+    mc_hc_w4( src,              i_src_stride, cent, 4, i_height );
+    mc_hh_w4( src+i_src_stride, i_src_stride, horz, 4, i_height );
+    pixel_avg_w4( dst, i_dst_stride, cent, 4, horz, 4, i_height );
+}
+
+
+/*****************************************************************************
+ * MC with width == 8 (height <= 16)
+ *****************************************************************************/
+#if 0 /* plain C version, compiled out */
+static void mc_copy_w8( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
+{
+    int y;
+
+    for( y = 0; y < i_height; y++ )
+    {
+        memcpy( dst, src, 8 );
+
+        src += i_src_stride;
+        dst += i_dst_stride;
+    }
+}
+#else /* NOTE(review): defined outside this file -- the mc.asm header mentions mc_copy_w4/8/16; confirm */
+extern void mc_copy_w8( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height );
+#endif
+
+static inline void mc_hh_w8( uint8_t *src, int i_src, uint8_t *dst, int i_dst, int i_height )
+{   /* horizontal half-pel for 8 columns: two independent 4-column passes */
+    mc_hh_w4( src,     i_src, dst,     i_dst, i_height );
+    mc_hh_w4( src + 4, i_src, dst + 4, i_dst, i_height );
+}
+static inline void mc_hv_w8( uint8_t *src, int i_src, uint8_t *dst, int i_dst, int i_height )
+{   /* Vertical 6-tap half-pel filter for an 8-column block; low 4 columns accumulate in mm0, high 4 in mm5. */
+    int y;
+
+    src -= 2 * i_src;               /* the filter window starts two rows above */
+
+    MMX_ZERO( %%mm7 );              /* mm7 = 0, used to widen bytes to words */
+    MMX_INIT( %%mm6, x264_w0x10 );  /* mm6 = 16 in each word: rounding term for the >> 5 */
+
+    for( y = 0; y < i_height; y++ )
+    {
+        MMX_LOAD_2x8( %%mm0, %%mm5, %%mm1, %%mm2, %%mm7,  &src[0*i_src], i_src );
+        MMX_FILTERTAP2_P1( %%mm0, %%mm1, %%mm5, %%mm2 );
+
+        /* window rows 2 and 3: the pair weighted by 20 */
+        MMX_LOAD_2x8( %%mm1, %%mm3, %%mm2, %%mm4, %%mm7,  &src[2*i_src], i_src );
+        MMX_FILTERTAP2_P2( %%mm0, %%mm1, %%mm2, %%mm5, %%mm3, %%mm4 );
+        /* window rows 4 and 5, then round and shift each half */
+        MMX_LOAD_2x8( %%mm1, %%mm3, %%mm2, %%mm4, %%mm7,  &src[4*i_src], i_src );
+        MMX_FILTERTAP_P3( %%mm0, %%mm1, %%mm2, %%mm6, %%mm7 );
+        MMX_FILTERTAP_P3( %%mm5, %%mm3, %%mm4, %%mm6, %%mm7 );
+
+        MMX_SAVEPACK_8P( %%mm0, %%mm5, %%mm7, dst );
+
+        src += i_src;
+        dst += i_dst;
+    }
+}
+
+static inline void mc_hc_w8( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
+{   /* centre half-pel, 8 columns: vertical 6-tap into tap[0..12] (asm + one C tap), then horizontal 6-tap in C */
+    int x, y;
+
+    asm volatile( "pxor %%mm7,        %%mm7\n" : : );
+
+    for( y = 0; y < i_height; y++ )
+    {
+        int16_t tap[5+8];
+        uint8_t *p_win = src-2*i_src_stride-2;  /* top-left of the window; "+r" because the asm advances it by 8 */
+        /* first 8 taps via movq, then 4 more via movd */
+        asm volatile(
+            "leal   (%0, %1),   %%eax\n"
+
+            "movq       (%0),   %%mm0\n"    /* load pix-2 */
+            "movq       %%mm0,  %%mm2\n"
+            "punpcklbw  %%mm7,  %%mm0\n"
+            "punpckhbw  %%mm7,  %%mm2\n"
+
+            "movq       (%%eax),%%mm1\n"    /* load pix-1 */
+            "movq       %%mm1,  %%mm3\n"
+            "punpcklbw  %%mm7,  %%mm1\n"
+            "punpckhbw  %%mm7,  %%mm3\n"
+            "psubw      %%mm1,  %%mm0\n"
+            "psllw      $2,     %%mm1\n"
+            "psubw      %%mm1,  %%mm0\n"
+            "psubw      %%mm3,  %%mm2\n"
+            "psllw      $2,     %%mm3\n"
+            "psubw      %%mm3,  %%mm2\n"
+
+            "movq       (%%eax,%1),%%mm1\n"  /* load pix */
+            "movq       %%mm1,  %%mm3\n"
+            "punpcklbw  %%mm7,  %%mm1\n"
+            "punpckhbw  %%mm7,  %%mm3\n"
+            "psllw      $2,     %%mm1\n"
+            "paddw      %%mm1,  %%mm0\n"
+            "psllw      $2,     %%mm1\n"
+            "paddw      %%mm1,  %%mm0\n"
+            "psllw      $2,     %%mm3\n"
+            "paddw      %%mm3,  %%mm2\n"
+            "psllw      $2,     %%mm3\n"
+            "paddw      %%mm3,  %%mm2\n"
+
+            "movq       (%%eax,%1,2),%%mm1\n"  /* load pix+1 */
+            "movq       %%mm1,  %%mm3\n"
+            "punpcklbw  %%mm7,  %%mm1\n"
+            "punpckhbw  %%mm7,  %%mm3\n"
+            "psllw      $2,     %%mm1\n"
+            "paddw      %%mm1,  %%mm0\n"
+            "psllw      $2,     %%mm1\n"
+            "paddw      %%mm1,  %%mm0\n"
+            "psllw      $2,     %%mm3\n"
+            "paddw      %%mm3,  %%mm2\n"
+            "psllw      $2,     %%mm3\n"
+            "paddw      %%mm3,  %%mm2\n"
+
+            "movq       (%0,%1,4),%%mm1\n"  /* load pix+2 */
+            "movq       %%mm1,  %%mm3\n"
+            "punpcklbw  %%mm7,  %%mm1\n"
+            "punpckhbw  %%mm7,  %%mm3\n"
+            "psubw      %%mm1,  %%mm0\n"
+            "psllw      $2,     %%mm1\n"
+            "psubw      %%mm1,  %%mm0\n"
+            "psubw      %%mm3,  %%mm2\n"
+            "psllw      $2,     %%mm3\n"
+            "psubw      %%mm3,  %%mm2\n"
+
+            "movq       (%%eax,%1,4),%%mm1\n"  /* load pix+3 */
+            "movq       %%mm1,  %%mm3\n"
+            "punpcklbw  %%mm7,  %%mm1\n"
+            "punpckhbw  %%mm7,  %%mm3\n"
+            "paddw      %%mm1,  %%mm0\n"
+            "paddw      %%mm3,  %%mm2\n"
+
+            "movq       %%mm0,   (%2)\n"
+            "movq       %%mm2,  8(%2)\n"
+
+
+            "addl   $8,         %%eax\n"
+            "addl   $8,         %0\n"
+
+
+            "movd       (%0),   %%mm0\n"    /* load pix-2 */
+            "punpcklbw  %%mm7,  %%mm0\n"
+
+            "movd       (%%eax),%%mm1\n"    /* load pix-1 */
+            "punpcklbw  %%mm7,  %%mm1\n"
+            "psubw      %%mm1,  %%mm0\n"
+            "psllw      $2,     %%mm1\n"
+            "psubw      %%mm1,  %%mm0\n"
+
+            "movd       (%%eax,%1),%%mm1\n"  /* load pix */
+            "punpcklbw  %%mm7,  %%mm1\n"
+            "psllw      $2,     %%mm1\n"
+            "paddw      %%mm1,  %%mm0\n"
+            "psllw      $2,     %%mm1\n"
+            "paddw      %%mm1,  %%mm0\n"
+
+            "movd       (%%eax,%1,2),%%mm1\n"  /* load pix+1 */
+            "punpcklbw  %%mm7,  %%mm1\n"
+            "psllw      $2,     %%mm1\n"
+            "paddw      %%mm1,  %%mm0\n"
+            "psllw      $2,     %%mm1\n"
+            "paddw      %%mm1,  %%mm0\n"
+
+            "movd       (%0,%1,4),%%mm1\n"  /* load pix+2 */
+            "punpcklbw  %%mm7,  %%mm1\n"
+            "psubw      %%mm1,  %%mm0\n"
+            "psllw      $2,     %%mm1\n"
+            "psubw      %%mm1,  %%mm0\n"
+
+            "movd       (%%eax,%1,4),%%mm1\n"  /* load pix+3 */
+            "punpcklbw  %%mm7,  %%mm1\n"
+            "paddw      %%mm1,  %%mm0\n"
+
+            "movq       %%mm0,  16(%2)\n"
+            : "+r"(p_win) : "r"(i_src_stride), "r"(&tap[0]) : "%eax", "memory" );
+
+        /* last one */
+        tap[8+4] = x264_tapfilter( &src[-2+8+4], i_src_stride );
+
+        for( x = 0; x < 8; x++ )
+        {
+            dst[x] = x264_mc_clip1( ( tap[0+x] - 5*tap[1+x] + 20 * tap[2+x] + 20 * tap[3+x] -5*tap[4+x] + tap[5+x] + 512 ) >> 10 );
+        }
+
+        src += i_src_stride;
+        dst += i_dst_stride;
+    }
+}
+
+/* mc I+H */
+static void mc_xy10_w8( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
+{   /* (dqx,dqy) = (1,0): rounded mean of the integer samples and the horizontal half-pel plane */
+    uint8_t horz[8*16];
+    mc_hh_w8( src, i_src_stride, horz, 8, i_height );
+    pixel_avg_w8( dst, i_dst_stride, src, i_src_stride, horz, 8, i_height );
+}
+static void mc_xy30_w8( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
+{   /* (dqx,dqy) = (3,0): rounded mean of the integer samples one column right and the horizontal half-pel plane */
+    uint8_t horz[8*16];
+    mc_hh_w8( src, i_src_stride, horz, 8, i_height );
+    pixel_avg_w8( dst, i_dst_stride, src+1, i_src_stride, horz, 8, i_height );
+}
+/* mc I+V */
+static void mc_xy01_w8( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
+{   /* (dqx,dqy) = (0,1): rounded mean of the integer samples and the vertical half-pel plane */
+    uint8_t vert[8*16];
+    mc_hv_w8( src, i_src_stride, vert, 8, i_height );
+    pixel_avg_w8( dst, i_dst_stride, src, i_src_stride, vert, 8, i_height );
+}
+static void mc_xy03_w8( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
+{   /* (dqx,dqy) = (0,3): rounded mean of the integer samples one row down and the vertical half-pel plane */
+    uint8_t vert[8*16];
+    mc_hv_w8( src, i_src_stride, vert, 8, i_height );
+    pixel_avg_w8( dst, i_dst_stride, src+i_src_stride, i_src_stride, vert, 8, i_height );
+}
+/* H+V */
+static void mc_xy11_w8( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
+{
+    /* (dqx,dqy) = (1,1): mean of the vertical and the horizontal half-pel planes, both taken at src */
+    uint8_t vert[8*16], horz[8*16];
+
+    mc_hv_w8( src, i_src_stride, vert, 8, i_height );
+    mc_hh_w8( src, i_src_stride, horz, 8, i_height );
+    pixel_avg_w8( dst, i_dst_stride, vert, 8, horz, 8, i_height );
+}
+static void mc_xy31_w8( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
+{
+    /* (dqx,dqy) = (3,1): mean of the vertical half-pel plane one column right and the horizontal one at src */
+    uint8_t vert[8*16], horz[8*16];
+
+    mc_hv_w8( src+1, i_src_stride, vert, 8, i_height );
+    mc_hh_w8( src,   i_src_stride, horz, 8, i_height );
+    pixel_avg_w8( dst, i_dst_stride, vert, 8, horz, 8, i_height );
+}
+static void mc_xy13_w8( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
+{
+    /* (dqx,dqy) = (1,3): mean of the vertical half-pel plane at src and the horizontal one a row below */
+    uint8_t vert[8*16], horz[8*16];
+
+    mc_hv_w8( src,              i_src_stride, vert, 8, i_height );
+    mc_hh_w8( src+i_src_stride, i_src_stride, horz, 8, i_height );
+    pixel_avg_w8( dst, i_dst_stride, vert, 8, horz, 8, i_height );
+}
+static void mc_xy33_w8( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
+{
+    /* (dqx,dqy) = (3,3): mean of the vertical half-pel plane one column right and the horizontal one a row below */
+    uint8_t vert[8*16], horz[8*16];
+
+    mc_hv_w8( src+1,            i_src_stride, vert, 8, i_height );
+    mc_hh_w8( src+i_src_stride, i_src_stride, horz, 8, i_height );
+    pixel_avg_w8( dst, i_dst_stride, vert, 8, horz, 8, i_height );
+}
+static void mc_xy21_w8( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
+{
+    /* (dqx,dqy) = (2,1): mean of the centre half-pel plane and the horizontal half-pel plane at src */
+    uint8_t cent[8*16], horz[8*16];
+
+    mc_hc_w8( src, i_src_stride, cent, 8, i_height );
+    mc_hh_w8( src, i_src_stride, horz, 8, i_height );
+    pixel_avg_w8( dst, i_dst_stride, cent, 8, horz, 8, i_height );
+}
+static void mc_xy12_w8( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
+{
+    /* (dqx,dqy) = (1,2): mean of the centre half-pel plane and the vertical half-pel plane at src */
+    uint8_t cent[8*16], vert[8*16];
+
+    mc_hc_w8( src, i_src_stride, cent, 8, i_height );
+    mc_hv_w8( src, i_src_stride, vert, 8, i_height );
+    pixel_avg_w8( dst, i_dst_stride, cent, 8, vert, 8, i_height );
+}
+static void mc_xy32_w8( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
+{
+    /* (dqx,dqy) = (3,2): mean of the centre half-pel plane and the vertical half-pel plane one column right */
+    uint8_t cent[8*16], vert[8*16];
+
+    mc_hc_w8( src,   i_src_stride, cent, 8, i_height );
+    mc_hv_w8( src+1, i_src_stride, vert, 8, i_height );
+    pixel_avg_w8( dst, i_dst_stride, cent, 8, vert, 8, i_height );
+}
+static void mc_xy23_w8( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
+{
+    /* (dqx,dqy) = (2,3): mean of the centre half-pel plane and the horizontal half-pel plane a row below */
+    uint8_t cent[8*16], horz[8*16];
+
+    mc_hc_w8( src,              i_src_stride, cent, 8, i_height );
+    mc_hh_w8( src+i_src_stride, i_src_stride, horz, 8, i_height );
+    pixel_avg_w8( dst, i_dst_stride, cent, 8, horz, 8, i_height );
+}
+
+
+/*****************************************************************************
+ * MC with width == 16 (height <= 16)
+ *****************************************************************************/
+#if 0 /* plain C version, compiled out */
+static void mc_copy_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
+{
+    int y;
+
+    for( y = 0; y < i_height; y++ )
+    {
+        memcpy( dst, src, 16 );
+
+        src += i_src_stride;
+        dst += i_dst_stride;
+    }
+}
+#else /* NOTE(review): defined outside this file -- the mc.asm header mentions mc_copy_w4/8/16; confirm */
+extern void mc_copy_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height );
+#endif
+static inline void mc_hh_w16( uint8_t *src, int i_src, uint8_t *dst, int i_dst, int i_height )
+{   /* horizontal half-pel for 16 columns: four independent 4-column passes, left to right */
+    mc_hh_w4( src,      i_src, dst,      i_dst, i_height );
+    mc_hh_w4( src +  4, i_src, dst +  4, i_dst, i_height );
+    mc_hh_w4( src +  8, i_src, dst +  8, i_dst, i_height );
+    mc_hh_w4( src + 12, i_src, dst + 12, i_dst, i_height );
+}
+static inline void mc_hv_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
+{   /* vertical half-pel for 16 columns: left 8-column pass, then right 8-column pass */
+    mc_hv_w8( src,     i_src_stride, dst,     i_dst_stride, i_height );
+    mc_hv_w8( src + 8, i_src_stride, dst + 8, i_dst_stride, i_height );
+}
+
+static inline void mc_hc_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
+{   /* centre half-pel for 16 columns: left 8-column pass, then right 8-column pass */
+    mc_hc_w8( src,     i_src_stride, dst,     i_dst_stride, i_height );
+    mc_hc_w8( src + 8, i_src_stride, dst + 8, i_dst_stride, i_height );
+}
+
+/* mc I+H */
+static void mc_xy10_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
+{   /* (dqx,dqy) = (1,0): rounded mean of the integer samples and the horizontal half-pel plane */
+    uint8_t horz[16*16];
+    mc_hh_w16( src, i_src_stride, horz, 16, i_height );
+    pixel_avg_w16( dst, i_dst_stride, src, i_src_stride, horz, 16, i_height );
+}
+static void mc_xy30_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
+{   /* (dqx,dqy) = (3,0): rounded mean of the integer samples one column right and the horizontal half-pel plane */
+    uint8_t horz[16*16];
+    mc_hh_w16( src, i_src_stride, horz, 16, i_height );
+    pixel_avg_w16( dst, i_dst_stride, src+1, i_src_stride, horz, 16, i_height );
+}
+/* mc I+V */
+static void mc_xy01_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
+{   /* (dqx,dqy) = (0,1): rounded mean of the integer samples and the vertical half-pel plane */
+    uint8_t vert[16*16];
+    mc_hv_w16( src, i_src_stride, vert, 16, i_height );
+    pixel_avg_w16( dst, i_dst_stride, src, i_src_stride, vert, 16, i_height );
+}
+static void mc_xy03_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
+{   /* (dqx,dqy) = (0,3): rounded mean of the integer samples one row down and the vertical half-pel plane */
+    uint8_t vert[16*16];
+    mc_hv_w16( src, i_src_stride, vert, 16, i_height );
+    pixel_avg_w16( dst, i_dst_stride, src+i_src_stride, i_src_stride, vert, 16, i_height );
+}
+/* H+V */
+static void mc_xy11_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
+{
+    /* (dqx,dqy) = (1,1): mean of the vertical and the horizontal half-pel planes, both taken at src */
+    uint8_t vert[16*16], horz[16*16];
+
+    mc_hv_w16( src, i_src_stride, vert, 16, i_height );
+    mc_hh_w16( src, i_src_stride, horz, 16, i_height );
+    pixel_avg_w16( dst, i_dst_stride, vert, 16, horz, 16, i_height );
+}
+static void mc_xy31_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
+{
+    /* (dqx,dqy) = (3,1): mean of the vertical half-pel plane one column right and the horizontal one at src */
+    uint8_t vert[16*16], horz[16*16];
+
+    mc_hv_w16( src+1, i_src_stride, vert, 16, i_height );
+    mc_hh_w16( src,   i_src_stride, horz, 16, i_height );
+    pixel_avg_w16( dst, i_dst_stride, vert, 16, horz, 16, i_height );
+}
+static void mc_xy13_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
+{
+    /* (dqx,dqy) = (1,3): mean of the vertical half-pel plane at src and the horizontal one a row below */
+    uint8_t vert[16*16], horz[16*16];
+
+    mc_hv_w16( src,              i_src_stride, vert, 16, i_height );
+    mc_hh_w16( src+i_src_stride, i_src_stride, horz, 16, i_height );
+    pixel_avg_w16( dst, i_dst_stride, vert, 16, horz, 16, i_height );
+}
+static void mc_xy33_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
+{
+    /* (dqx,dqy) = (3,3): mean of the vertical half-pel plane one column right and the horizontal one a row below */
+    uint8_t vert[16*16], horz[16*16];
+
+    mc_hv_w16( src+1,            i_src_stride, vert, 16, i_height );
+    mc_hh_w16( src+i_src_stride, i_src_stride, horz, 16, i_height );
+    pixel_avg_w16( dst, i_dst_stride, vert, 16, horz, 16, i_height );
+}
+static void mc_xy21_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
+{
+    /* (dqx,dqy) = (2,1): mean of the centre half-pel plane and the horizontal half-pel plane at src */
+    uint8_t cent[16*16], horz[16*16];
+
+    mc_hc_w16( src, i_src_stride, cent, 16, i_height );
+    mc_hh_w16( src, i_src_stride, horz, 16, i_height );
+    pixel_avg_w16( dst, i_dst_stride, cent, 16, horz, 16, i_height );
+}
+static void mc_xy12_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
+{
+    /* (dqx,dqy) = (1,2): mean of the centre half-pel plane and the vertical half-pel plane at src */
+    uint8_t cent[16*16], vert[16*16];
+
+    mc_hc_w16( src, i_src_stride, cent, 16, i_height );
+    mc_hv_w16( src, i_src_stride, vert, 16, i_height );
+    pixel_avg_w16( dst, i_dst_stride, cent, 16, vert, 16, i_height );
+}
+static void mc_xy32_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
+{
+    /* (dqx,dqy) = (3,2): mean of the centre half-pel plane and the vertical half-pel plane one column right */
+    uint8_t cent[16*16], vert[16*16];
+
+    mc_hc_w16( src,   i_src_stride, cent, 16, i_height );
+    mc_hv_w16( src+1, i_src_stride, vert, 16, i_height );
+    pixel_avg_w16( dst, i_dst_stride, cent, 16, vert, 16, i_height );
+}
+static void mc_xy23_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
+{
+    /* (dqx,dqy) = (2,3): mean of the centre half-pel plane and the horizontal half-pel plane a row below */
+    uint8_t cent[16*16], horz[16*16];
+
+    mc_hc_w16( src,              i_src_stride, cent, 16, i_height );
+    mc_hh_w16( src+i_src_stride, i_src_stride, horz, 16, i_height );
+    pixel_avg_w16( dst, i_dst_stride, cent, 16, horz, 16, i_height );
+}
+
+static void motion_compensation_luma( uint8_t *src, int i_src_stride,
+                                      uint8_t *dst, int i_dst_stride,
+                                      int mvx,int mvy,
+                                      int i_width, int i_height )
+{
+    static const pf_mc_t pf_mc[3][4][4] =    /*XXX [dqy][dqx] */
+    {
+        {
+            { mc_copy_w4,  mc_xy10_w4,    mc_hh_w4,      mc_xy30_w4 },
+            { mc_xy01_w4,  mc_xy11_w4,    mc_xy21_w4,    mc_xy31_w4 },
+            { mc_hv_w4,    mc_xy12_w4,    mc_hc_w4,      mc_xy32_w4 },
+            { mc_xy03_w4,  mc_xy13_w4,    mc_xy23_w4,    mc_xy33_w4 },
+        },
+        {
+            { mc_copy_w8,  mc_xy10_w8,    mc_hh_w8,      mc_xy30_w8 },
+            { mc_xy01_w8,  mc_xy11_w8,    mc_xy21_w8,    mc_xy31_w8 },
+            { mc_hv_w8,    mc_xy12_w8,    mc_hc_w8,      mc_xy32_w8 },
+            { mc_xy03_w8,  mc_xy13_w8,    mc_xy23_w8,    mc_xy33_w8 },
+        },
+        {
+            { mc_copy_w16,  mc_xy10_w16,    mc_hh_w16,      mc_xy30_w16 },
+            { mc_xy01_w16,  mc_xy11_w16,    mc_xy21_w16,    mc_xy31_w16 },
+            { mc_hv_w16,    mc_xy12_w16,    mc_hc_w16,      mc_xy32_w16 },
+            { mc_xy03_w16,  mc_xy13_w16,    mc_xy23_w16,    mc_xy33_w16 },
+        }
+    };
+    /* mvx/mvy: >> 2 gives the whole-sample offset, & 3 selects the interpolation routine.
+     * Widths 4, 8 and 16 select table 0, 1 and 2; any other width is reported and dst is left untouched. */
+    const int i_dqx = mvx & 0x03;
+    const int i_dqy = mvy & 0x03;
+    int i_size;
+
+    src += (mvy >> 2) * i_src_stride + (mvx >> 2);
+    switch( i_width )
+    {
+        case 4:  i_size = 0; break;
+        case 8:  i_size = 1; break;
+        case 16: i_size = 2; break;
+        default:
+            fprintf( stderr, "Error: motion_compensation_luma called with invalid width\n" );
+            return;
+    }
+
+    pf_mc[i_size][i_dqy][i_dqx]( src, i_src_stride, dst, i_dst_stride, i_height );
+}
+
+void x264_mc_mmxext_init( x264_mc_function_t pf[2] )
+{   /* Installs the luma routine of this file in pf[MC_LUMA]; no other entry of pf is written here. */
+    pf[MC_LUMA]   = motion_compensation_luma;
+}
+
diff --git a/core/i386/mc.asm b/core/i386/mc.asm
new file mode 100644
index 00000000..0210a6b3
--- /dev/null
+++ b/core/i386/mc.asm
@@ -0,0 +1,187 @@
+;*****************************************************************************
+;* mc.asm: h264 encoder library
+;*****************************************************************************
+;* Copyright (C) 2003 x264 project
+;* $Id: mc.asm,v 1.1 2004/06/03 19:27:07 fenrir Exp $
+;*
+;* Authors: Min Chen <chenm001.163.com> (converted to nasm)
+;*          Laurent Aimar <fenrir@via.ecp.fr> (init algorithm)
+;*
+;* This program is free software; you can redistribute it and/or modify
+;* it under the terms of the GNU General Public License as published by
+;* the Free Software Foundation; either version 2 of the License, or
+;* (at your option) any later version.
+;*
+;* This program is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;* GNU General Public License for more details.
+;*
+;* You should have received a copy of the GNU General Public License
+;* along with this program; if not, write to the Free Software
+;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+;*****************************************************************************
+
+;*****************************************************************************
+;*                                                                           *
+;*  Revision history:                                                        *
+;*                                                                           *
+;*  2004.05.17 portab mc_copy_w4/8/16 (CM)                                   *
+;*                                                                           *
+;*****************************************************************************
+
+BITS 32
+
+;=============================================================================
+; Macros and other preprocessor constants
+;=============================================================================
+
+%macro cglobal 1                ; export a symbol, underscore-prefixed when PREFIX is defined
+	%ifdef PREFIX
+		global _%1              ; exported with a leading underscore
+		%define %1 _%1
+	%else
+		global %1               ; exported exactly as written
+	%endif
+%endmacro
+
+;=============================================================================
+; Local Data (Read Only)
+;=============================================================================
+
+%ifdef FORMAT_COFF
+SECTION .rodata data
+%else
+SECTION .rodata data align=16
+%endif
+
+;-----------------------------------------------------------------------------
+; Various memory constants (none are defined in this file yet)
+;-----------------------------------------------------------------------------
+
+ALIGN 16
+
+;=============================================================================
+; Code
+;=============================================================================
+
+SECTION .text
+
+cglobal mc_copy_w4
+; Copies i_height rows of 4 bytes each from src to dst, two rows per loop pass.
+ALIGN 16
+;-----------------------------------------------------------------------------
+;   void mc_copy_w4( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
+;-----------------------------------------------------------------------------
+mc_copy_w4:
+    push    ebx                 ; three saved registers + return address: first argument is at [esp+16]
+    push    esi
+    push    edi
+
+    mov     esi, [esp+16]       ; src
+    mov     edi, [esp+24]       ; dst
+    mov     ebx, [esp+20]       ; i_src_stride
+    mov     edx, [esp+28]       ; i_dst_stride
+    mov     ecx, [esp+32]       ; i_height
+ALIGN 4
+.height_loop
+    mov     eax, [esi]          ; row 0: a single 32-bit move carries the 4 bytes
+    mov     [edi], eax
+    mov     eax, [esi+ebx]      ; row 1
+    mov     [edi+edx], eax
+    lea     esi, [esi+ebx*2]    ; step both pointers down two rows
+    lea     edi, [edi+edx*2]
+    dec     ecx                 ; two rows consumed per pass; the loop stops only when ecx
+    dec     ecx                 ; lands exactly on zero, so i_height must be a non-zero even count
+    jne     .height_loop
+
+    pop     edi
+    pop     esi
+    pop     ebx
+    ret
+
+cglobal mc_copy_w8
+; Copies i_height rows of 8 bytes each from src to dst, four rows per loop pass (MMX moves).
+ALIGN 16
+;-----------------------------------------------------------------------------
+;   void mc_copy_w8( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
+;-----------------------------------------------------------------------------
+mc_copy_w8:
+    push    ebx                 ; three saved registers + return address: first argument is at [esp+16]
+    push    esi
+    push    edi
+
+    mov     esi, [esp+16]       ; src
+    mov     edi, [esp+24]       ; dst
+    mov     ebx, [esp+20]       ; i_src_stride
+    mov     edx, [esp+28]       ; i_dst_stride
+    mov     ecx, [esp+32]       ; i_height
+ALIGN 4
+.height_loop
+    movq    mm0, [esi]          ; row 0, 8 bytes per movq
+    movq    [edi], mm0
+    movq    mm1, [esi+ebx]      ; row 1
+    movq    [edi+edx], mm1
+    movq    mm2, [esi+ebx*2]    ; row 2
+    movq    [edi+edx*2], mm2
+    lea     esi, [esi+ebx*2]    ; advance two rows ...
+    lea     edi, [edi+edx*2]
+    movq    mm3, [esi+ebx]      ; ... so base+stride now addresses row 3
+    movq    [edi+edx], mm3
+    lea     esi, [esi+ebx*2]    ; advance two more rows: four rows handled per pass
+    lea     edi, [edi+edx*2]
+    
+    sub     ecx, byte 4         ; loop stops only on exactly zero: i_height must be a non-zero multiple of 4
+    jnz     .height_loop
+
+    pop     edi
+    pop     esi
+    pop     ebx
+    ret                         ; NOTE(review): no emms is issued here - presumably the caller clears MMX state; confirm
+
+cglobal mc_copy_w16
+; Copies i_height rows of 16 bytes each from src to dst, four rows per loop pass (two movq per row).
+ALIGN 16
+;-----------------------------------------------------------------------------
+;   void mc_copy_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
+;-----------------------------------------------------------------------------
+mc_copy_w16:
+    push    ebx                 ; three saved registers + return address: first argument is at [esp+16]
+    push    esi
+    push    edi
+
+    mov     esi, [esp+16]       ; src
+    mov     edi, [esp+24]       ; dst
+    mov     ebx, [esp+20]       ; i_src_stride
+    mov     edx, [esp+28]       ; i_dst_stride
+    mov     ecx, [esp+32]       ; i_height
+ALIGN 4
+.height_loop
+    movq    mm0, [esi]          ; row 0, bytes 0-7
+    movq    mm1, [esi+8]        ; row 0, bytes 8-15
+    movq    [edi], mm0
+    movq    [edi+8], mm1
+    movq    mm2, [esi+ebx]      ; row 1
+    movq    mm3, [esi+ebx+8]
+    movq    [edi+edx], mm2
+    movq    [edi+edx+8], mm3
+    movq    mm4, [esi+ebx*2]    ; row 2
+    movq    mm5, [esi+ebx*2+8]
+    movq    [edi+edx*2], mm4
+    movq    [edi+edx*2+8], mm5
+    lea     esi, [esi+ebx*2]    ; advance two rows ...
+    lea     edi, [edi+edx*2]
+    movq    mm6, [esi+ebx]      ; ... so base+stride now addresses row 3
+    movq    mm7, [esi+ebx+8]
+    movq    [edi+edx], mm6
+    movq    [edi+edx+8], mm7
+    lea     esi, [esi+ebx*2]    ; advance two more rows: four rows handled per pass
+    lea     edi, [edi+edx*2]
+    
+    sub     ecx, byte 4         ; loop stops only on exactly zero: i_height must be a non-zero multiple of 4
+    jnz     .height_loop
+
+    pop     edi
+    pop     esi
+    pop     ebx
+    ret                         ; NOTE(review): no emms is issued here - presumably the caller clears MMX state; confirm
diff --git a/core/i386/mc.h b/core/i386/mc.h
new file mode 100644
index 00000000..c3e906fc
--- /dev/null
+++ b/core/i386/mc.h
@@ -0,0 +1,29 @@
+/*****************************************************************************
+ * mc.h: h264 encoder library
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: mc.h,v 1.1 2004/06/03 19:27:07 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#ifndef _I386_MC_H
+#define _I386_MC_H 1
+
+void x264_mc_mmxext_init( x264_mc_function_t pf[2] );   /* stores the MMXEXT luma routine in pf[MC_LUMA] */
+
+#endif
diff --git a/core/i386/pixel.asm b/core/i386/pixel.asm
new file mode 100644
index 00000000..14015741
--- /dev/null
+++ b/core/i386/pixel.asm
@@ -0,0 +1,705 @@
+;*****************************************************************************
+;* pixel.asm: h264 encoder library
+;*****************************************************************************
+;* Copyright (C) 2003 x264 project
+;* $Id: pixel.asm,v 1.1 2004/06/03 19:27:07 fenrir Exp $
+;*
+;* Authors: Laurent Aimar <fenrir@via.ecp.fr>
+;*
+;* This program is free software; you can redistribute it and/or modify
+;* it under the terms of the GNU General Public License as published by
+;* the Free Software Foundation; either version 2 of the License, or
+;* (at your option) any later version.
+;*
+;* This program is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;* GNU General Public License for more details.
+;*
+;* You should have received a copy of the GNU General Public License
+;* along with this program; if not, write to the Free Software
+;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+;*****************************************************************************
+
+BITS 32
+
+;=============================================================================
+; Macros and other preprocessor constants
+;=============================================================================
+
+%macro cglobal 1                ; export a symbol, underscore-prefixed when PREFIX is defined
+	%ifdef PREFIX
+		global _%1              ; exported with a leading underscore
+		%define %1 _%1
+	%else
+		global %1               ; exported exactly as written
+	%endif
+%endmacro
+
+%macro SAD_INC_2x16P 0          ; mm0 += SAD of two 16-pixel rows; eax/ecx = pix1/pix2, ebx/edx = their strides
+    movq    mm1,    [eax]       ; row 0, bytes 0-7 of pix1
+    movq    mm2,    [ecx]       ; row 0, bytes 0-7 of pix2
+    movq    mm3,    [eax+8]     ; row 0, bytes 8-15 of pix1
+    movq    mm4,    [ecx+8]     ; row 0, bytes 8-15 of pix2
+
+    psadbw  mm1,    mm2         ; sum of absolute byte differences of each 8-byte half
+    psadbw  mm3,    mm4
+    paddw   mm0,    mm1         ; fold both halves into the running total
+    paddw   mm0,    mm3
+
+    movq    mm1,    [eax+ebx]   ; row 1, same pattern
+    movq    mm2,    [ecx+edx]
+    movq    mm3,    [eax+ebx+8]
+    movq    mm4,    [ecx+edx+8]
+
+    psadbw  mm1,    mm2
+    psadbw  mm3,    mm4
+    paddw   mm0,    mm1
+    paddw   mm0,    mm3
+
+    lea     eax,    [eax+2*ebx] ; move both pointers down two rows
+    lea     ecx,    [ecx+2*edx]
+%endmacro
+
+%macro SAD_INC_2x8P 0           ; mm0 += SAD of two 8-pixel rows; eax/ecx = pix1/pix2, ebx/edx = their strides
+    movq    mm1,    [eax]       ; row 0 of pix1
+    movq    mm2,    [ecx]       ; row 0 of pix2
+    movq    mm3,    [eax+ebx]   ; row 1 of pix1
+    movq    mm4,    [ecx+edx]   ; row 1 of pix2
+
+    psadbw  mm1,    mm2         ; sum of absolute byte differences, one row each
+    psadbw  mm3,    mm4
+    paddw   mm0,    mm1         ; fold both rows into the running total
+    paddw   mm0,    mm3
+
+    lea     eax,    [eax+2*ebx] ; move both pointers down two rows
+    lea     ecx,    [ecx+2*edx]
+%endmacro
+
+%macro SAD_INC_2x4P 0           ; mm0 += SAD of two 4-pixel rows; eax/ecx = pix1/pix2, ebx/edx = their strides
+    movd    mm1,    [eax]       ; 4 bytes only; movd clears the upper half of the register,
+    movd    mm2,    [ecx]       ; so the unused upper bytes contribute nothing to psadbw
+    movd    mm3,    [eax+ebx]   ; row 1 of pix1
+    movd    mm4,    [ecx+edx]   ; row 1 of pix2
+
+    psadbw  mm1,    mm2         ; sum of absolute byte differences, one row each
+    psadbw  mm3,    mm4
+    paddw   mm0,    mm1         ; fold both rows into the running total
+    paddw   mm0,    mm3
+
+    lea     eax,    [eax+2*ebx] ; move both pointers down two rows
+    lea     ecx,    [ecx+2*edx]
+%endmacro
+
+%macro LOAD_DIFF_4P 5  ; MMP, MMT, MMZ, [pix1], [pix2]  ->  MMP = four 16-bit differences pix1 - pix2
+    movd        %1, %4          ; 4 bytes of pix1
+    punpcklbw   %1, %3          ; widen to words; MMZ supplies the high byte of every word
+    movd        %2, %5          ; 4 bytes of pix2 into the temporary
+    punpcklbw   %2, %3          ; identical MMZ high bytes here, so they cancel in the subtraction below
+    psubw       %1, %2          ; MMP = pix1 - pix2 in each word lane
+%endmacro
+
+%macro LOAD_DIFF_INC_4x4 11 ; p1,p2,p3,p4, t, z, pix1, i_pix1, pix2, i_pix2, offset
+    LOAD_DIFF_4P %1, %5, %6, [%7+%11],    [%9+%11]          ; row 0 differences at the column offset
+    LOAD_DIFF_4P %2, %5, %6, [%7+%8+%11], [%9+%10+%11]      ; row 1
+    lea %7, [%7+2*%8]                                       ; both pointers down two rows
+    lea %9, [%9+2*%10]
+    LOAD_DIFF_4P %3, %5, %6, [%7+%11],    [%9+%11]          ; row 2
+    LOAD_DIFF_4P %4, %5, %6, [%7+%8+%11], [%9+%10+%11]      ; row 3
+    lea %7, [%7+2*%8]                                       ; pointers finish four rows below their start,
+    lea %9, [%9+2*%10]                                      ; ready for the next 4x4 block underneath
+%endmacro
+
+%macro HADAMARD4_SUB_BADC 4     ; a, b, c, d  ->  a+b, b-a, c+d, d-c (two in-place butterflies)
+    paddw %1,   %2              ; a = a + b
+    paddw %3,   %4              ; c = c + d
+    paddw %2,   %2              ; b = 2b
+    paddw %4,   %4              ; d = 2d
+    psubw %2,   %1              ; b = 2b - (a + b) = b - a
+    psubw %4,   %3              ; d = 2d - (c + d) = d - c
+%endmacro
+
+%macro HADAMARD4x4 4            ; two butterfly stages over four registers, applied to every 16-bit lane
+    HADAMARD4_SUB_BADC %1, %2, %3, %4   ; stage 1: pairs (1st,2nd) and (3rd,4th)
+    HADAMARD4_SUB_BADC %1, %3, %2, %4   ; stage 2: pairs (1st,3rd) and (2nd,4th)
+%endmacro
+
+%macro SBUTTERFLYwd 3           ; a, b, t: a = low-half word interleave of a,b; t = high-half interleave; b untouched
+    movq        %3, %1          ; copy a so the high half is still available
+    punpcklwd   %1, %2          ; interleave the two low words of each source
+    punpckhwd   %3, %2          ; interleave the two high words of each source
+%endmacro
+
+%macro SBUTTERFLYdq 3           ; a, b, t: a = low dwords of a,b; t = high dwords of a,b; b untouched
+    movq        %3, %1          ; copy a so the high dword is still available
+    punpckldq   %1, %2          ; low dword of a followed by low dword of b
+    punpckhdq   %3, %2          ; high dword of a followed by high dword of b
+%endmacro
+
+%macro TRANSPOSE4x4 5   ; abcd-t -> adtc  (transposed rows 0..3 end up in a, d, t, c)
+    SBUTTERFLYwd %1, %2, %5     ; word-interleave rows a,b: low half stays in a, high half goes to t
+    SBUTTERFLYwd %3, %4, %2     ; word-interleave rows c,d: low half stays in c, high half goes to b
+    SBUTTERFLYdq %1, %3, %4     ; combine the low halves: a = output row 0, d = output row 1
+    SBUTTERFLYdq %5, %2, %3     ; combine the high halves: t = output row 2, c = output row 3
+%endmacro
+
+%macro MMX_ABS 2        ; mma, mmt  ->  mma = |mma| per signed word; mmt is overwritten
+    pxor    %2, %2      ; mmt = 0
+    psubw   %2, %1      ; mmt = -mma
+    pmaxsw  %1, %2      ; mma = max(mma, -mma)
+%endmacro
+
+%macro MMX_ABS_SUM 3    ; mma, mmt, mms  ->  mms += |mma| per word; mma and mmt are overwritten
+    pxor    %2, %2      ; mmt = 0
+    psubw   %2, %1      ; mmt = -mma
+    pmaxsw  %1, %2      ; mma = max(mma, -mma)
+    paddusw %3, %1      ; accumulate with unsigned saturation
+%endmacro
+
+
+%macro MMX_SUM_MM 2     ; mmv, mmt  ->  eax = (saturating 16-bit sum of the four words of mmv) >> 1
+    movq    %2, %1
+    psrlq   %1, 32      ; bring the upper two words down
+    paddusw %1, %2      ; words 0 and 1 now hold w0+w2 and w1+w3
+    movq    %2, %1
+    psrlq   %1, 16      ; bring word 1 down to word 0
+    paddusw %1, %2      ; word 0 = w0+w1+w2+w3 (each add saturates at 16 bits)
+    movd    eax,%1
+    and     eax,0xffff  ; keep word 0 only
+    shr     eax,1       ; the returned value is half of the word total
+%endmacro
+
+%macro HADAMARD4x4_FIRST 0      ; in: mm0-mm3 = four rows of differences; out: mm0 = per-word sum of |coefficients|
+    HADAMARD4x4 mm0, mm1, mm2, mm3          ; first pass across the four row registers
+    TRANSPOSE4x4 mm0, mm1, mm2, mm3, mm4    ; transposed rows land in mm0, mm3, mm4, mm2
+    HADAMARD4x4 mm0, mm3, mm4, mm2          ; second pass on the transposed data
+    MMX_ABS     mm0, mm7                    ; mm0 starts the accumulator; mm7 is overwritten as scratch
+    MMX_ABS_SUM mm3, mm7, mm0               ; add the remaining three rows of absolute values
+    MMX_ABS_SUM mm4, mm7, mm0
+    MMX_ABS_SUM mm2, mm7, mm0
+%endmacro
+
+%macro HADAMARD4x4_NEXT 0       ; in: mm1-mm4 = four rows of differences; mm0 += per-word sum of |coefficients|
+    HADAMARD4x4 mm1, mm2, mm3, mm4          ; first pass across the four row registers
+    TRANSPOSE4x4 mm1, mm2, mm3, mm4, mm5    ; transposed rows land in mm1, mm4, mm5, mm3
+    HADAMARD4x4 mm1, mm4, mm5, mm3          ; second pass on the transposed data
+    MMX_ABS_SUM mm1, mm7, mm0               ; mm0 keeps the total of earlier blocks; mm7 is scratch
+    MMX_ABS_SUM mm4, mm7, mm0
+    MMX_ABS_SUM mm5, mm7, mm0
+    MMX_ABS_SUM mm3, mm7, mm0
+%endmacro
+
+;=============================================================================
+; Code
+;=============================================================================
+
+SECTION .text
+
+cglobal x264_pixel_sad_16x16_mmxext     ; SAD entry points (psadbw based), one per block size
+cglobal x264_pixel_sad_16x8_mmxext
+cglobal x264_pixel_sad_8x16_mmxext
+cglobal x264_pixel_sad_8x8_mmxext
+cglobal x264_pixel_sad_8x4_mmxext
+cglobal x264_pixel_sad_4x8_mmxext
+cglobal x264_pixel_sad_4x4_mmxext
+
+cglobal x264_pixel_satd_4x4_mmxext      ; SATD entry points (4x4 Hadamard of the differences, absolute sum)
+cglobal x264_pixel_satd_4x8_mmxext
+cglobal x264_pixel_satd_8x4_mmxext
+cglobal x264_pixel_satd_8x8_mmxext
+cglobal x264_pixel_satd_16x8_mmxext
+cglobal x264_pixel_satd_8x16_mmxext
+cglobal x264_pixel_satd_16x16_mmxext
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+;   int __cdecl x264_pixel_sad_16x16_mmxext (uint8_t *, int, uint8_t *, int )
+;-----------------------------------------------------------------------------
+x264_pixel_sad_16x16_mmxext:
+    push    ebx                 ; ebx carries stride1; with it saved the arguments start at [esp+8]
+
+    mov     eax,    [esp+ 8]    ; pix1
+    mov     ebx,    [esp+12]    ; stride1
+    mov     ecx,    [esp+16]    ; pix2
+    mov     edx,    [esp+20]    ; stride2
+
+    pxor    mm0,    mm0         ; SAD accumulator = 0
+
+    SAD_INC_2x16P               ; rows 0-1
+    SAD_INC_2x16P               ; rows 2-3
+    SAD_INC_2x16P               ; rows 4-5
+    SAD_INC_2x16P               ; rows 6-7
+
+    SAD_INC_2x16P               ; rows 8-9
+    SAD_INC_2x16P               ; rows 10-11
+    SAD_INC_2x16P               ; rows 12-13
+    SAD_INC_2x16P               ; rows 14-15
+
+    movd eax,    mm0            ; return value = low 32 bits of the accumulator
+
+    pop ebx
+    ret
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+;   int __cdecl x264_pixel_sad_16x8_mmxext (uint8_t *, int, uint8_t *, int )
+;-----------------------------------------------------------------------------
+x264_pixel_sad_16x8_mmxext:
+    push    ebx                 ; ebx carries stride1; with it saved the arguments start at [esp+8]
+
+    mov     eax,    [esp+ 8]    ; pix1
+    mov     ebx,    [esp+12]    ; stride1
+    mov     ecx,    [esp+16]    ; pix2
+    mov     edx,    [esp+20]    ; stride2
+
+    pxor    mm0,    mm0         ; SAD accumulator = 0
+
+    SAD_INC_2x16P               ; rows 0-1
+    SAD_INC_2x16P               ; rows 2-3
+    SAD_INC_2x16P               ; rows 4-5
+    SAD_INC_2x16P               ; rows 6-7
+
+    movd eax,    mm0            ; return value = low 32 bits of the accumulator
+
+    pop ebx
+    ret
+
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+;   int __cdecl x264_pixel_sad_8x16_mmxext (uint8_t *, int, uint8_t *, int )
+;-----------------------------------------------------------------------------
+x264_pixel_sad_8x16_mmxext:
+    push    ebx                 ; ebx carries stride1; with it saved the arguments start at [esp+8]
+
+    mov     eax,    [esp+ 8]    ; pix1
+    mov     ebx,    [esp+12]    ; stride1
+    mov     ecx,    [esp+16]    ; pix2
+    mov     edx,    [esp+20]    ; stride2
+
+    pxor    mm0,    mm0         ; SAD accumulator = 0
+
+    SAD_INC_2x8P                ; rows 0-1
+    SAD_INC_2x8P                ; rows 2-3
+    SAD_INC_2x8P                ; rows 4-5
+    SAD_INC_2x8P                ; rows 6-7
+
+    SAD_INC_2x8P                ; rows 8-9
+    SAD_INC_2x8P                ; rows 10-11
+    SAD_INC_2x8P                ; rows 12-13
+    SAD_INC_2x8P                ; rows 14-15
+
+    movd eax,    mm0            ; return value = low 32 bits of the accumulator
+
+    pop ebx
+    ret
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+;   int __cdecl x264_pixel_sad_8x8_mmxext (uint8_t *, int, uint8_t *, int )
+;-----------------------------------------------------------------------------
+x264_pixel_sad_8x8_mmxext:
+    push    ebx                 ; ebx carries stride1; with it saved the arguments start at [esp+8]
+
+    mov     eax,    [esp+ 8]    ; pix1
+    mov     ebx,    [esp+12]    ; stride1
+    mov     ecx,    [esp+16]    ; pix2
+    mov     edx,    [esp+20]    ; stride2
+
+    pxor    mm0,    mm0         ; SAD accumulator = 0
+
+    SAD_INC_2x8P                ; rows 0-1
+    SAD_INC_2x8P                ; rows 2-3
+    SAD_INC_2x8P                ; rows 4-5
+    SAD_INC_2x8P                ; rows 6-7
+
+    movd eax,    mm0            ; return value = low 32 bits of the accumulator
+
+    pop ebx
+    ret
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+;   int __cdecl x264_pixel_sad_8x4_mmxext (uint8_t *, int, uint8_t *, int )
+;-----------------------------------------------------------------------------
+x264_pixel_sad_8x4_mmxext:
+    push    ebx                 ; ebx carries stride1; with it saved the arguments start at [esp+8]
+
+    mov     eax,    [esp+ 8]    ; pix1
+    mov     ebx,    [esp+12]    ; stride1
+    mov     ecx,    [esp+16]    ; pix2
+    mov     edx,    [esp+20]    ; stride2
+
+    pxor    mm0,    mm0         ; SAD accumulator = 0
+
+    SAD_INC_2x8P                ; rows 0-1
+    SAD_INC_2x8P                ; rows 2-3
+
+    movd eax,    mm0            ; return value = low 32 bits of the accumulator
+
+    pop ebx
+    ret
+
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+;   int __cdecl x264_pixel_sad_4x8_mmxext (uint8_t *, int, uint8_t *, int )
+;-----------------------------------------------------------------------------
+x264_pixel_sad_4x8_mmxext:
+    push    ebx                 ; ebx carries stride1; with it saved the arguments start at [esp+8]
+
+    mov     eax,    [esp+ 8]    ; pix1
+    mov     ebx,    [esp+12]    ; stride1
+    mov     ecx,    [esp+16]    ; pix2
+    mov     edx,    [esp+20]    ; stride2
+
+    pxor    mm0,    mm0         ; SAD accumulator = 0
+
+    SAD_INC_2x4P                ; rows 0-1
+    SAD_INC_2x4P                ; rows 2-3
+
+    SAD_INC_2x4P                ; rows 4-5
+    SAD_INC_2x4P                ; rows 6-7
+
+    movd eax,    mm0            ; return value = low 32 bits of the accumulator
+
+    pop ebx
+    ret
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+;   int __cdecl x264_pixel_sad_4x4_mmxext (uint8_t *, int, uint8_t *, int )
+;-----------------------------------------------------------------------------
+x264_pixel_sad_4x4_mmxext:
+    push    ebx                 ; ebx carries stride1; with it saved the arguments start at [esp+8]
+
+    mov     eax,    [esp+ 8]    ; pix1
+    mov     ebx,    [esp+12]    ; stride1
+    mov     ecx,    [esp+16]    ; pix2
+    mov     edx,    [esp+20]    ; stride2
+
+    pxor    mm0,    mm0         ; SAD accumulator = 0
+
+    SAD_INC_2x4P                ; rows 0-1
+    SAD_INC_2x4P                ; rows 2-3
+
+    movd eax,    mm0            ; return value = low 32 bits of the accumulator
+
+    pop ebx
+    ret
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+;   int __cdecl x264_pixel_satd_4x4_mmxext (uint8_t *, int, uint8_t *, int )
+;-----------------------------------------------------------------------------
+x264_pixel_satd_4x4_mmxext:
+    push    ebx                 ; ebx carries stride1; with it saved the arguments start at [esp+8]
+
+    mov     eax,    [esp+ 8]    ; pix1
+    mov     ebx,    [esp+12]    ; stride1
+    mov     ecx,    [esp+16]    ; pix2
+    mov     edx,    [esp+20]    ; stride2
+
+    pxor    mm7,    mm7         ; high-byte filler for the byte->word unpacks
+
+
+    LOAD_DIFF_4P mm0, mm6, mm7, [eax],       [ecx]          ; row 0 differences
+    LOAD_DIFF_4P mm1, mm6, mm7, [eax+ebx],   [ecx+edx]      ; row 1
+    LOAD_DIFF_4P mm2, mm6, mm7, [eax+2*ebx], [ecx+2*edx]    ; row 2
+    add eax, ebx                ; bump both pointers one row so that
+    add ecx, edx                ; base+2*stride addresses row 3
+    LOAD_DIFF_4P mm3, mm6, mm7, [eax+2*ebx], [ecx+2*edx]    ; row 3
+
+    HADAMARD4x4_FIRST           ; mm0 = per-word sums of the absolute transformed differences
+
+    MMX_SUM_MM  mm0, mm7        ; eax = (sum of the four words of mm0) >> 1
+    pop     ebx
+    ret
+
+
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+;   int __cdecl x264_pixel_satd_4x8_mmxext (uint8_t *, int, uint8_t *, int )
+;-----------------------------------------------------------------------------
+x264_pixel_satd_4x8_mmxext:
+    push    ebx                 ; ebx carries stride1; with it saved the arguments start at [esp+8]
+
+    mov     eax,    [esp+ 8]    ; pix1
+    mov     ebx,    [esp+12]    ; stride1
+    mov     ecx,    [esp+16]    ; pix2
+    mov     edx,    [esp+20]    ; stride2
+
+    pxor    mm7,    mm7         ; high-byte filler for the byte->word unpacks
+
+    LOAD_DIFF_INC_4x4 mm0, mm1, mm2, mm3, mm6, mm7, eax, ebx, ecx, edx, 0    ; rows 0-3
+    HADAMARD4x4_FIRST           ; starts the accumulator in mm0
+
+    LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 0    ; rows 4-7 (pointers were advanced by the first load)
+    HADAMARD4x4_NEXT            ; adds the second block to mm0
+
+    MMX_SUM_MM  mm0, mm7        ; eax = (sum of the four words of mm0) >> 1
+    pop     ebx
+    ret
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+;   int __cdecl x264_pixel_satd_8x4_mmxext (uint8_t *, int, uint8_t *, int )
+;-----------------------------------------------------------------------------
+x264_pixel_satd_8x4_mmxext:
+    push    ebx                 ; ebx carries stride1; with it saved the arguments start at [esp+8]
+
+    mov     eax,    [esp+ 8]    ; pix1
+    mov     ebx,    [esp+12]    ; stride1
+    mov     ecx,    [esp+16]    ; pix2
+    mov     edx,    [esp+20]    ; stride2
+
+    pxor    mm7,    mm7         ; high-byte filler for the byte->word unpacks
+
+    LOAD_DIFF_INC_4x4 mm0, mm1, mm2, mm3, mm6, mm7, eax, ebx, ecx, edx, 0    ; columns 0-3, rows 0-3
+    HADAMARD4x4_FIRST           ; starts the accumulator in mm0
+
+    mov     eax,    [esp+ 8]    ; pix1 (reloaded: the load macro advanced it four rows)
+    mov     ecx,    [esp+16]    ; pix2 (reloaded likewise)
+
+    LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 4    ; columns 4-7, rows 0-3
+    HADAMARD4x4_NEXT            ; adds the second block to mm0
+
+    MMX_SUM_MM  mm0, mm7        ; eax = (sum of the four words of mm0) >> 1
+    pop     ebx
+    ret
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+;   int __cdecl x264_pixel_satd_8x8_mmxext (uint8_t *, int, uint8_t *, int )
+;-----------------------------------------------------------------------------
+x264_pixel_satd_8x8_mmxext:
+    push    ebx                 ; ebx carries stride1; with it saved the arguments start at [esp+8]
+
+    mov     eax,    [esp+ 8]    ; pix1
+    mov     ebx,    [esp+12]    ; stride1
+    mov     ecx,    [esp+16]    ; pix2
+    mov     edx,    [esp+20]    ; stride2
+
+    pxor    mm7,    mm7         ; high-byte filler for the byte->word unpacks
+
+    LOAD_DIFF_INC_4x4 mm0, mm1, mm2, mm3, mm6, mm7, eax, ebx, ecx, edx, 0    ; columns 0-3, rows 0-3
+    HADAMARD4x4_FIRST           ; starts the accumulator in mm0
+
+    LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 0    ; columns 0-3, rows 4-7
+    HADAMARD4x4_NEXT
+
+    mov     eax,    [esp+ 8]    ; pix1 (reloaded to return to row 0)
+    mov     ecx,    [esp+16]    ; pix2 (reloaded likewise)
+
+    LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 4    ; columns 4-7, rows 0-3
+    HADAMARD4x4_NEXT
+
+    LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 4    ; columns 4-7, rows 4-7
+    HADAMARD4x4_NEXT
+
+    MMX_SUM_MM  mm0, mm7        ; eax = (sum of the four words of mm0) >> 1
+    pop     ebx
+    ret
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+;   int __cdecl x264_pixel_satd_16x8_mmxext (uint8_t *, int, uint8_t *, int )
+;-----------------------------------------------------------------------------
+x264_pixel_satd_16x8_mmxext:
+    push    ebx                 ; two saved registers + return address: first argument is at [esp+12]
+    push    ebp                 ; ebp holds the partial result of the left 8x8 half
+
+    mov     eax,    [esp+12]    ; pix1
+    mov     ebx,    [esp+16]    ; stride1
+    mov     ecx,    [esp+20]    ; pix2
+    mov     edx,    [esp+24]    ; stride2
+
+    pxor    mm7,    mm7         ; high-byte filler for the byte->word unpacks
+    xor     ebp,    ebp         ; cleared here, then overwritten by "mov ebp, eax" before it is read
+
+    LOAD_DIFF_INC_4x4 mm0, mm1, mm2, mm3, mm6, mm7, eax, ebx, ecx, edx, 0    ; columns 0-3, rows 0-3
+    HADAMARD4x4_FIRST           ; starts the accumulator in mm0
+
+    LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 0    ; columns 0-3, rows 4-7
+    HADAMARD4x4_NEXT
+
+    mov     eax,    [esp+12]    ; pix1 (reloaded to return to row 0)
+    mov     ecx,    [esp+20]    ; pix2 (reloaded likewise)
+
+    LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 4    ; columns 4-7, rows 0-3
+    HADAMARD4x4_NEXT
+
+    LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 4    ; columns 4-7, rows 4-7
+    HADAMARD4x4_NEXT
+
+    MMX_SUM_MM  mm0, mm7        ; eax = halved total of the left 8x8 half
+    mov     ebp, eax            ; keep it while eax is reused as the pix1 pointer
+
+    mov     eax,    [esp+12]    ; pix1
+    mov     ecx,    [esp+20]    ; pix2
+
+    LOAD_DIFF_INC_4x4 mm0, mm1, mm2, mm3, mm6, mm7, eax, ebx, ecx, edx, 8    ; columns 8-11, rows 0-3
+    HADAMARD4x4_FIRST           ; restarts the accumulator in mm0
+
+    LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 8    ; columns 8-11, rows 4-7
+    HADAMARD4x4_NEXT
+
+    mov     eax,    [esp+12]    ; pix1
+    mov     ecx,    [esp+20]    ; pix2
+
+    LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 12   ; columns 12-15, rows 0-3
+    HADAMARD4x4_NEXT
+
+    LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 12   ; columns 12-15, rows 4-7
+    HADAMARD4x4_NEXT
+
+    MMX_SUM_MM  mm0, mm7        ; eax = halved total of the right 8x8 half
+    add         eax, ebp        ; return value = left half + right half
+
+    pop     ebp
+    pop     ebx
+    ret
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+;   int __cdecl x264_pixel_satd_8x16_mmxext (uint8_t *, int, uint8_t *, int )
+;-----------------------------------------------------------------------------
+x264_pixel_satd_8x16_mmxext:
+    push    ebx                 ; two saved registers + return address: first argument is at [esp+12]
+    push    ebp                 ; ebp holds the partial result of the left 4-pixel column strip
+
+    mov     eax,    [esp+12]    ; pix1
+    mov     ebx,    [esp+16]    ; stride1
+    mov     ecx,    [esp+20]    ; pix2
+    mov     edx,    [esp+24]    ; stride2
+
+    pxor    mm7,    mm7         ; high-byte filler for the byte->word unpacks
+    xor     ebp,    ebp         ; cleared here, then overwritten by "mov ebp, eax" before it is read
+
+    LOAD_DIFF_INC_4x4 mm0, mm1, mm2, mm3, mm6, mm7, eax, ebx, ecx, edx, 0    ; columns 0-3, rows 0-3
+    HADAMARD4x4_FIRST           ; starts the accumulator in mm0
+
+    LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 0    ; columns 0-3, rows 4-7
+    HADAMARD4x4_NEXT
+
+    LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 0    ; columns 0-3, rows 8-11
+    HADAMARD4x4_NEXT
+
+    LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 0    ; columns 0-3, rows 12-15
+    HADAMARD4x4_NEXT
+
+    MMX_SUM_MM  mm0, mm7        ; eax = halved total of the left strip
+    mov     ebp, eax            ; keep it while eax is reused as the pix1 pointer
+
+    mov     eax,    [esp+12]    ; pix1 (reloaded to return to row 0)
+    mov     ecx,    [esp+20]    ; pix2 (reloaded likewise)
+
+    LOAD_DIFF_INC_4x4 mm0, mm1, mm2, mm3, mm6, mm7, eax, ebx, ecx, edx, 4    ; columns 4-7, rows 0-3
+    HADAMARD4x4_FIRST           ; restarts the accumulator in mm0
+
+    LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 4    ; columns 4-7, rows 4-7
+    HADAMARD4x4_NEXT
+
+    LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 4    ; columns 4-7, rows 8-11
+    HADAMARD4x4_NEXT
+
+    LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 4    ; columns 4-7, rows 12-15
+    HADAMARD4x4_NEXT
+
+    MMX_SUM_MM  mm0, mm7        ; eax = halved total of the right strip
+    add     eax,    ebp         ; return value = left strip + right strip
+
+    pop     ebp
+    pop     ebx
+    ret
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+;   int __cdecl x264_pixel_satd_16x16_mmxext (uint8_t *, int, uint8_t *, int )
+;-----------------------------------------------------------------------------
+x264_pixel_satd_16x16_mmxext:
+    push    ebx                 ; two saved registers + return address: first argument is at [esp+12]
+    push    ebp                 ; ebp accumulates the per-strip results
+
+    mov     eax,    [esp+12]    ; pix1
+    mov     ebx,    [esp+16]    ; stride1
+    mov     ecx,    [esp+20]    ; pix2
+    mov     edx,    [esp+24]    ; stride2
+
+    pxor    mm7,    mm7         ; high-byte filler for the byte->word unpacks
+    xor     ebp,    ebp         ; cleared here, then overwritten by "mov ebp, eax" before it is read
+
+    LOAD_DIFF_INC_4x4 mm0, mm1, mm2, mm3, mm6, mm7, eax, ebx, ecx, edx, 0    ; columns 0-3, rows 0-3
+    HADAMARD4x4_FIRST           ; starts the accumulator in mm0
+
+    LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 0    ; columns 0-3, rows 4-7
+    HADAMARD4x4_NEXT
+
+    LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 0    ; columns 0-3, rows 8-11
+    HADAMARD4x4_NEXT
+
+    LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 0    ; columns 0-3, rows 12-15
+    HADAMARD4x4_NEXT
+
+    MMX_SUM_MM  mm0, mm7        ; eax = halved total of strip 0
+    mov     ebp, eax            ; first partial result
+
+    mov     eax,    [esp+12]    ; pix1 (reloaded to return to row 0)
+    mov     ecx,    [esp+20]    ; pix2 (reloaded likewise)
+
+    LOAD_DIFF_INC_4x4 mm0, mm1, mm2, mm3, mm6, mm7, eax, ebx, ecx, edx, 4    ; columns 4-7, rows 0-3
+    HADAMARD4x4_FIRST           ; restarts the accumulator in mm0
+
+    LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 4    ; columns 4-7, rows 4-7
+    HADAMARD4x4_NEXT
+
+    LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 4    ; columns 4-7, rows 8-11
+    HADAMARD4x4_NEXT
+
+    LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 4    ; columns 4-7, rows 12-15
+    HADAMARD4x4_NEXT
+
+    MMX_SUM_MM  mm0, mm7        ; eax = halved total of strip 1
+    add     ebp,    eax
+
+    mov     eax,    [esp+12]    ; pix1
+    mov     ecx,    [esp+20]    ; pix2
+
+    LOAD_DIFF_INC_4x4 mm0, mm1, mm2, mm3, mm6, mm7, eax, ebx, ecx, edx, 8    ; columns 8-11, rows 0-3
+    HADAMARD4x4_FIRST           ; restarts the accumulator in mm0
+
+    LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 8    ; columns 8-11, rows 4-7
+    HADAMARD4x4_NEXT
+
+    LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 8    ; columns 8-11, rows 8-11
+    HADAMARD4x4_NEXT
+
+    LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 8    ; columns 8-11, rows 12-15
+    HADAMARD4x4_NEXT
+
+    MMX_SUM_MM  mm0, mm7        ; eax = halved total of strip 2
+    add     ebp,    eax
+
+    mov     eax,    [esp+12]    ; pix1
+    mov     ecx,    [esp+20]    ; pix2
+
+    LOAD_DIFF_INC_4x4 mm0, mm1, mm2, mm3, mm6, mm7, eax, ebx, ecx, edx, 12   ; columns 12-15, rows 0-3
+    HADAMARD4x4_FIRST           ; restarts the accumulator in mm0
+
+    LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 12   ; columns 12-15, rows 4-7
+    HADAMARD4x4_NEXT
+
+    LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 12   ; columns 12-15, rows 8-11
+    HADAMARD4x4_NEXT
+
+    LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 12   ; columns 12-15, rows 12-15
+    HADAMARD4x4_NEXT
+
+    MMX_SUM_MM  mm0, mm7        ; eax = halved total of strip 3
+    add     eax,    ebp         ; return value = sum of the four strip results
+
+    pop     ebp
+    pop     ebx
+    ret
+
diff --git a/core/i386/pixel.h b/core/i386/pixel.h
new file mode 100644
index 00000000..799cbfde
--- /dev/null
+++ b/core/i386/pixel.h
@@ -0,0 +1,43 @@
+/*****************************************************************************
+ * pixel.h: h264 encoder library
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: pixel.h,v 1.1 2004/06/03 19:27:07 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#ifndef _I386_PIXEL_H
+#define _I386_PIXEL_H 1
+
+int x264_pixel_sad_16x16_mmxext( uint8_t *, int, uint8_t *, int );   /* SAD family; arguments are pix1, stride1, pix2, stride2 */
+int x264_pixel_sad_16x8_mmxext( uint8_t *, int, uint8_t *, int );
+int x264_pixel_sad_8x16_mmxext( uint8_t *, int, uint8_t *, int );
+int x264_pixel_sad_8x8_mmxext( uint8_t *, int, uint8_t *, int );
+int x264_pixel_sad_8x4_mmxext( uint8_t *, int, uint8_t *, int );
+int x264_pixel_sad_4x8_mmxext( uint8_t *, int, uint8_t *, int );
+int x264_pixel_sad_4x4_mmxext( uint8_t *, int, uint8_t *, int );
+/* SATD family (Hadamard-transformed differences); arguments are pix1, stride1, pix2, stride2 */
+int x264_pixel_satd_16x16_mmxext( uint8_t *, int, uint8_t *, int );
+int x264_pixel_satd_16x8_mmxext( uint8_t *, int, uint8_t *, int );
+int x264_pixel_satd_8x16_mmxext( uint8_t *, int, uint8_t *, int );
+int x264_pixel_satd_8x8_mmxext( uint8_t *, int, uint8_t *, int );
+int x264_pixel_satd_8x4_mmxext( uint8_t *, int, uint8_t *, int );
+int x264_pixel_satd_4x8_mmxext( uint8_t *, int, uint8_t *, int );
+int x264_pixel_satd_4x4_mmxext( uint8_t *, int, uint8_t *, int );
+
+#endif
diff --git a/core/i386/predict.c b/core/i386/predict.c
new file mode 100644
index 00000000..587416bd
--- /dev/null
+++ b/core/i386/predict.c
@@ -0,0 +1,429 @@
+/*****************************************************************************
+ * predict.c: h264 encoder
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: predict.c,v 1.1 2004/06/03 19:27:07 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+/* XXX the 4x4 predictors are inspired by the ffmpeg h264 decoder
+ */
+
+#include <stdlib.h>
+#include <stdint.h>
+
+#include "x264.h"   /* for keyword inline */
+#include "../predict.h"
+#include "predict.h"
+
+static inline int clip_uint8( int a )
+{
+    /* Clamp a to [0,255]; the old (-a)>>31 trick gave -1, not 255, above 255. */
+    if( a < 0 )
+        return 0;
+    return a > 255 ? 255 : a;
+}
+
+/****************************************************************************
+ * 16x16 prediction for intra block DC, H, V, P
+ ****************************************************************************/
+static void predict_16x16_dc( uint8_t *src, int i_stride )
+{
+    /* DC = rounded mean of the 16 left and 16 top neighbours, replicated
+     * into every byte of a 32-bit word so a row is four word stores. */
+    const uint8_t *top  = src - i_stride;
+    const uint8_t *left = src - 1;
+    uint32_t sum = 0;
+    uint32_t fill;
+    int n, row;
+
+    for( n = 0; n < 16; n++ )
+        sum += left[n * i_stride] + top[n];
+    fill = ( ( sum + 16 ) >> 5 ) * 0x01010101;
+
+    /* fill the block row by row (src is accessed as uint32_t words) */
+    for( row = 0; row < 16; row++, src += i_stride )
+    {
+        uint32_t *dst = (uint32_t*)src;
+
+        dst[0] = fill;
+        dst[1] = fill;
+        dst[2] = fill;
+        dst[3] = fill;
+    }
+}
+static void predict_16x16_dc_left( uint8_t *src, int i_stride )
+{
+    /* DC from the left column only: rounded mean of the 16 left neighbours,
+     * replicated into every byte of a 32-bit word. */
+    const uint8_t *left = src - 1;
+    uint32_t sum = 0;
+    uint32_t fill;
+    int row;
+
+    for( row = 0; row < 16; row++ )
+        sum += left[row * i_stride];
+    fill = ( ( sum + 8 ) >> 4 ) * 0x01010101;
+
+    for( row = 0; row < 16; row++, src += i_stride )
+    {
+        uint32_t *dst = (uint32_t*)src;
+
+        dst[0] = fill;
+        dst[1] = fill;
+        dst[2] = fill;
+        dst[3] = fill;
+    }
+}
+static void predict_16x16_dc_top( uint8_t *src, int i_stride )
+{
+    /* DC from the top row only: rounded mean of the 16 pixels above the
+     * block, replicated into every byte of a 32-bit word. */
+    const uint8_t *top = src - i_stride;
+    uint32_t sum = 0;
+    uint32_t fill;
+    int col, row;
+
+    for( col = 0; col < 16; col++ )
+        sum += top[col];
+    fill = ( ( sum + 8 ) >> 4 ) * 0x01010101;
+
+    for( row = 0; row < 16; row++, src += i_stride )
+    {
+        uint32_t *dst = (uint32_t*)src;
+
+        dst[0] = fill;
+        dst[1] = fill;
+        dst[2] = fill;
+        dst[3] = fill;
+    }
+}
+static void predict_16x16_dc_128( uint8_t *src, int i_stride )
+{
+    /* No neighbours are read: all 16x16 pixels are set to 128 (0x80). */
+    const uint32_t fill = 0x80808080;
+    int row;
+
+    for( row = 0; row < 16; row++, src += i_stride )
+    {
+        uint32_t *dst = (uint32_t*)src;
+
+        dst[0] = fill;
+        dst[1] = fill;
+        dst[2] = fill;
+        dst[3] = fill;
+    }
+}
+static void predict_16x16_h( uint8_t *src, int i_stride )
+{
+    /* Horizontal prediction: each of the 16 rows is filled with the pixel
+     * immediately to its left, src[-1]. */
+    int row;
+
+    for( row = 0; row < 16; row++, src += i_stride )
+    {
+        /* unsigned multiply: 0x01010101 * src[-1] overflows a signed int for src[-1] >= 128 */
+        const uint32_t fill = 0x01010101U * src[-1];
+        uint32_t *dst = (uint32_t*)src;
+
+        dst[0] = fill;
+        dst[1] = fill;
+        dst[2] = fill;
+        dst[3] = fill;
+    }
+}
+static void predict_16x16_v( uint8_t *src, int i_stride )
+{
+    int i;
+    /* Vertical prediction: the 16 pixels above the block are held in mm0/mm1 and stored to all 16 rows. NOTE(review): no emms is issued here - confirm the caller restores FPU state. */
+    __asm__ volatile(
+        "movq  (%0), %%mm0\n"
+        "movq 8(%0), %%mm1\n" :: "r"(&src[-i_stride]) : "memory" );
+    for( i = 0; i < 16; i++ )
+    {
+        /* "memory" clobber: the asm writes through %0, which an "r" input does not tell the compiler */
+        __asm__ volatile(
+            "movq %%mm0,  (%0)\n"
+            "movq %%mm1, 8(%0)\n" :: "r"(src) : "memory" );
+        src += i_stride;
+    }
+}
+
+/****************************************************************************
+ * 8x8 prediction for intra chroma block DC, H, V, P
+ ****************************************************************************/
+static void predict_8x8_dc_128( uint8_t *src, int i_stride )
+{
+    /* No neighbours are read: all 8x8 pixels are set to 128 (0x80),
+     * two 32-bit stores per row. */
+    int row;
+
+    for( row = 0; row < 8; row++, src += i_stride )
+    {
+        uint32_t *dst = (uint32_t*)src;
+
+        dst[0] = 0x80808080;
+        dst[1] = 0x80808080;
+    }
+}
+static void predict_8x8_dc_left( uint8_t *src, int i_stride )
+{
+    /* Chroma DC from the left column only.  The upper four rows are filled
+     * with the rounded mean of left neighbours 0-3, the lower four rows with
+     * the rounded mean of left neighbours 4-7. */
+    const uint8_t *left = src - 1;
+    uint32_t sum_upper = 0;
+    uint32_t sum_lower = 0;
+    uint32_t fill_upper;
+    uint32_t fill_lower;
+    int row;
+
+    /* accumulate both halves of the left column in one pass */
+    for( row = 0; row < 4; row++ )
+    {
+        sum_upper += left[row * i_stride];
+        sum_lower += left[(row + 4) * i_stride];
+    }
+    fill_upper = ( ( sum_upper + 2 ) >> 2 ) * 0x01010101;
+    fill_lower = ( ( sum_lower + 2 ) >> 2 ) * 0x01010101;
+
+    /* two 32-bit stores per 8-pixel row */
+    for( row = 0; row < 8; row++, src += i_stride )
+    {
+        const uint32_t fill = row < 4 ? fill_upper : fill_lower;
+        uint32_t *dst = (uint32_t*)src;
+
+        dst[0] = fill;
+        dst[1] = fill;
+    }
+}
+static void predict_8x8_dc_top( uint8_t *src, int i_stride )
+{
+    /* Chroma DC from the top row only: the left 4 columns take the rounded mean
+     * of top neighbours 0-3, the right 4 columns that of top neighbours 4-7. */
+    const uint8_t *top = src - i_stride;
+    uint32_t fill_l = 0, fill_r = 0;
+    int col, row;
+
+    for( col = 0; col < 4; col++ )
+    {
+        fill_l += top[col];
+        fill_r += top[col + 4];
+    }
+    fill_l = ( ( fill_l + 2 ) >> 2 ) * 0x01010101;
+    fill_r = ( ( fill_r + 2 ) >> 2 ) * 0x01010101;
+    for( row = 0; row < 8; row++, src += i_stride )
+    {
+        uint32_t *dst = (uint32_t*)src;
+        dst[0] = fill_l;
+        dst[1] = fill_r;
+    }
+}
+static void predict_8x8_dc( uint8_t *src, int i_stride )
+{
+    /* Chroma DC using both borders.  Neighbour sums:
+     *       s0 s1      (top row, columns 0-3 and 4-7)
+     *    s2            (left column, rows 0-3)
+     *    s3            (left column, rows 4-7)
+     * and the four 4x4 quadrants are filled with
+     *    dc0 dc1
+     *    dc2 dc3
+     */
+    const uint8_t *top  = src - i_stride;
+    const uint8_t *left = src - 1;
+    uint32_t s0 = 0, s1 = 0, s2 = 0, s3 = 0;
+    uint32_t dc0, dc1, dc2, dc3;
+    int i, row;
+
+    for( i = 0; i < 4; i++ )
+    {
+        s0 += top[i];
+        s1 += top[i + 4];
+        s2 += left[i * i_stride];
+        s3 += left[(i + 4) * i_stride];
+    }
+
+    /* The sums are unsigned so that replicating a DC value >= 128 with
+     * "* 0x01010101" cannot overflow a signed int. */
+    dc0 = ( ( s0 + s2 + 4 ) >> 3 ) * 0x01010101;
+    dc1 = ( ( s1 + 2 ) >> 2 ) * 0x01010101;
+    dc2 = ( ( s3 + 2 ) >> 2 ) * 0x01010101;
+    dc3 = ( ( s1 + s3 + 4 ) >> 3 ) * 0x01010101;
+
+    for( row = 0; row < 4; row++, src += i_stride )
+    {
+        uint32_t *dst = (uint32_t*)src;
+
+        dst[0] = dc0;
+        dst[1] = dc1;
+    }
+    for( row = 0; row < 4; row++, src += i_stride )
+    {
+        uint32_t *dst = (uint32_t*)src;
+
+        dst[0] = dc2;
+        dst[1] = dc3;
+    }
+}
+
+static void predict_8x8_h( uint8_t *src, int i_stride )
+{
+    /* Horizontal chroma prediction: each of the 8 rows is filled with the
+     * pixel immediately to its left, src[-1]. */
+    int row;
+
+    for( row = 0; row < 8; row++, src += i_stride )
+    {
+        const uint32_t fill = 0x01010101U * src[-1];    /* unsigned: no int overflow for src[-1] >= 128 */
+        uint32_t *dst = (uint32_t*)src;
+
+        dst[0] = fill;
+        dst[1] = fill;
+    }
+}
+static void predict_8x8_v( uint8_t *src, int i_stride )
+{
+    int i;
+    /* Vertical chroma prediction: mm0 holds the 8 pixels above the block; "memory" tells the compiler the asm touches memory via %0. */
+    __asm__ volatile( "movq  (%0), %%mm0\n" :: "r"(&src[-i_stride]) : "memory" );
+
+    for( i = 0; i < 8; i++ )
+    {
+        __asm__ volatile( "movq %%mm0,  (%0)\n" :: "r"(src) : "memory" );
+        src += i_stride;
+    }
+}
+
+
+/****************************************************************************
+ * 4x4 prediction for intra luma block DC, H, V, P
+ ****************************************************************************/
+static void predict_4x4_dc_128( uint8_t *src, int i_stride )
+{
+    /* No neighbours are read: all 4x4 pixels are set to 128 (0x80),
+     * one 32-bit store per row. */
+    int row;
+
+    for( row = 0; row < 4; row++, src += i_stride )
+    {
+        *(uint32_t*)src = 0x80808080;
+    }
+}
+static void predict_4x4_dc_left( uint8_t *src, int i_stride )
+{
+    /* DC from the left column only: rounded mean of the 4 left neighbours.
+     * Unsigned arithmetic: the old int multiply overflowed for DC >= 128. */
+    const uint8_t *left = src - 1;
+    const uint32_t sum = left[0] + left[i_stride] + left[2*i_stride] + left[3*i_stride];
+    const uint32_t fill = ( ( sum + 2 ) >> 2 ) * 0x01010101U;
+    int row;
+
+    for( row = 0; row < 4; row++, src += i_stride )
+    {
+        *(uint32_t*)src = fill;
+    }
+}
+static void predict_4x4_dc_top( uint8_t *src, int i_stride )
+{
+    /* DC from the top row only: rounded mean of the 4 pixels above the
+     * block.  Unsigned arithmetic: the old int multiply by 0x01010101
+     * overflowed for DC >= 128. */
+    const uint8_t *top = src - i_stride;
+    const uint32_t sum = top[0] + top[1] + top[2] + top[3];
+    const uint32_t fill = ( ( sum + 2 ) >> 2 ) * 0x01010101U;
+    int row;
+
+    for( row = 0; row < 4; row++, src += i_stride )
+        *(uint32_t*)src = fill;
+}
+static void predict_4x4_dc( uint8_t *src, int i_stride )
+{
+    /* DC = rounded mean of the 4 left and 4 top neighbours.  Unsigned
+     * arithmetic: the old int multiply overflowed for DC >= 128. */
+    const uint8_t *top  = src - i_stride;
+    const uint8_t *left = src - 1;
+    const uint32_t sum = left[0] + left[i_stride] + left[2*i_stride] + left[3*i_stride]
+                       + top[0] + top[1] + top[2] + top[3];
+    const uint32_t fill = ( ( sum + 4 ) >> 3 ) * 0x01010101U;
+    int row;
+
+    for( row = 0; row < 4; row++, src += i_stride )
+    {
+        *(uint32_t*)src = fill;
+    }
+}
+static void predict_4x4_h( uint8_t *src, int i_stride )
+{
+    /* Horizontal prediction: each of the 4 rows is filled with the pixel
+     * immediately to its left, src[-1]. */
+    int row;
+
+    for( row = 0; row < 4; row++, src += i_stride )
+    {
+        /* unsigned multiply: 0x01010101 * src[-1] overflows a signed int for src[-1] >= 128 */
+        *(uint32_t*)src = 0x01010101U * src[-1];
+    }
+}
+static void predict_4x4_v( uint8_t *src, int i_stride )
+{
+    /* Vertical prediction: the four pixels directly above the block are read
+     * as one 32-bit word and written to each of the four rows. */
+    const uint32_t above = *(uint32_t*)( src - i_stride );
+    int row;
+
+    for( row = 0; row < 4; row++ )
+    {
+        uint32_t *dst = (uint32_t*)( src + row * i_stride );
+
+        *dst = above;
+    }
+}
+
+/****************************************************************************
+ * Exported functions:
+ ****************************************************************************/
+void x264_predict_16x16_init_mmxext( x264_predict_t pf[7] )
+{
+    /* install this file's 16x16 predictors; other pf[] slots are left untouched */
+    pf[I_PRED_16x16_DC_128 ] = predict_16x16_dc_128;
+    pf[I_PRED_16x16_DC_TOP ] = predict_16x16_dc_top;
+    pf[I_PRED_16x16_DC_LEFT] = predict_16x16_dc_left;
+    pf[I_PRED_16x16_DC     ] = predict_16x16_dc;
+    pf[I_PRED_16x16_H      ] = predict_16x16_h; pf[I_PRED_16x16_V] = predict_16x16_v;
+}
+
+void x264_predict_8x8_init_mmxext( x264_predict_t pf[7] )
+{
+    /* install this file's 8x8 chroma predictors; other pf[] slots are left untouched */
+    pf[I_PRED_CHROMA_DC_128 ] = predict_8x8_dc_128;
+    pf[I_PRED_CHROMA_DC_TOP ] = predict_8x8_dc_top;
+    pf[I_PRED_CHROMA_DC_LEFT] = predict_8x8_dc_left;
+    pf[I_PRED_CHROMA_DC     ] = predict_8x8_dc;
+    pf[I_PRED_CHROMA_H      ] = predict_8x8_h; pf[I_PRED_CHROMA_V] = predict_8x8_v;
+}
+
+void x264_predict_4x4_init_mmxext( x264_predict_t pf[12] )
+{
+    /* install this file's 4x4 predictors; other pf[] slots are left untouched */
+    pf[I_PRED_4x4_DC_128 ] = predict_4x4_dc_128;
+    pf[I_PRED_4x4_DC_TOP ] = predict_4x4_dc_top;
+    pf[I_PRED_4x4_DC_LEFT] = predict_4x4_dc_left;
+    pf[I_PRED_4x4_DC     ] = predict_4x4_dc;
+    pf[I_PRED_4x4_H      ] = predict_4x4_h; pf[I_PRED_4x4_V] = predict_4x4_v;
+}
+
diff --git a/core/i386/predict.h b/core/i386/predict.h
new file mode 100644
index 00000000..b00b1e59
--- /dev/null
+++ b/core/i386/predict.h
@@ -0,0 +1,31 @@
+/*****************************************************************************
+ * predict.h: h264 encoder library
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: predict.h,v 1.1 2004/06/03 19:27:07 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#ifndef _I386_PREDICT_H
+#define _I386_PREDICT_H 1
+/* Each init stores the V, H, DC, DC_LEFT, DC_TOP and DC_128 predictors of i386/predict.c into pf[]. */
+void x264_predict_16x16_init_mmxext ( x264_predict_t pf[7] );
+void x264_predict_8x8_init_mmxext   ( x264_predict_t pf[7] );
+void x264_predict_4x4_init_mmxext   ( x264_predict_t pf[12] );
+
+#endif
diff --git a/core/macroblock.c b/core/macroblock.c
new file mode 100644
index 00000000..59603f03
--- /dev/null
+++ b/core/macroblock.c
@@ -0,0 +1,1029 @@
+/*****************************************************************************
+ * macroblock.c: h264 encoder library
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: macroblock.c,v 1.1 2004/06/03 19:27:06 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdint.h>
+
+#include "common.h"
+#include "macroblock.h"
+
+static const uint8_t block_idx_x[16] =     /* x value (0-3) for each of the 16 block indices */
+{
+    0, 1, 0, 1, 2, 3, 2, 3, 0, 1, 0, 1, 2, 3, 2, 3
+};
+static const uint8_t block_idx_y[16] =     /* y value (0-3) for each of the 16 block indices */
+{
+    0, 0, 1, 1, 0, 0, 1, 1, 2, 2, 3, 3, 2, 2, 3, 3
+};
+static const uint8_t block_idx_xy[4][4] =  /* inverse of the two tables above: block_idx_xy[x][y] is the index with that x and y */
+{
+    { 0, 2, 8,  10},
+    { 1, 3, 9,  11},
+    { 4, 6, 12, 14},
+    { 5, 7, 13, 15}
+};
+/* Multipliers used by the x264_mb_dequant_* functions below, indexed [i_qscale % 6][y][x]. */
+static const int dequant_mf[6][4][4] =
+{
+    { {10, 13, 10, 13}, {13, 16, 13, 16}, {10, 13, 10, 13}, {13, 16, 13, 16} },
+    { {11, 14, 11, 14}, {14, 18, 14, 18}, {11, 14, 11, 14}, {14, 18, 14, 18} },
+    { {13, 16, 13, 16}, {16, 20, 16, 20}, {13, 16, 13, 16}, {16, 20, 16, 20} },
+    { {14, 18, 14, 18}, {18, 23, 18, 23}, {14, 18, 14, 18}, {18, 23, 18, 23} },
+    { {16, 20, 16, 20}, {20, 25, 20, 25}, {16, 20, 16, 20}, {20, 25, 20, 25} },
+    { {18, 23, 18, 23}, {23, 29, 23, 29}, {18, 23, 18, 23}, {23, 29, 23, 29} }
+};
+/* Compiled out by #if 0: a 52-entry table that maps 0-29 to themselves and grows more slowly above. */
+#if 0
+static const int i_chroma_qp_table[52] =
+{
+     0,  1,  2,  3,  4,  5,  6,  7,  8,  9,
+    10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+    20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+    29, 30, 31, 32, 32, 33, 34, 34, 35, 35,
+    36, 36, 37, 37, 37, 38, 38, 38, 39, 39,
+    39, 39
+};
+#endif
+
+int x264_mb_predict_intra4x4_mode( x264_t *h, int idx )
+{
+    /* Predicted mode = smaller of the modes cached one entry to the left (-1)
+     * and one cache row up (-8); a negative result falls back to DC. */
+    const int i_pos  = x264_scan8[idx];
+    const int i_left = h->mb.cache.intra4x4_pred_mode[i_pos - 1];
+    const int i_top  = h->mb.cache.intra4x4_pred_mode[i_pos - 8];
+    const int i_pred = X264_MIN( i_left, i_top );
+
+    return i_pred < 0 ? I_PRED_4x4_DC : i_pred;
+}
+
+int x264_mb_predict_non_zero_code( x264_t *h, int idx )
+{
+    /* Sum of the counts cached to the left (-1) and above (-8).  Sums below
+     * 0x80 are replaced by their rounded average; the result is always
+     * masked to 7 bits. */
+    const int i_pos = x264_scan8[idx];
+    const int i_sum = h->mb.cache.non_zero_count[i_pos - 1]
+                    + h->mb.cache.non_zero_count[i_pos - 8];
+
+    if( i_sum >= 0x80 )
+        return i_sum & 0x7f;
+    return ( ( i_sum + 1 ) >> 1 ) & 0x7f;
+}
+
+/****************************************************************************
+ * Scan and Quant functions
+ ****************************************************************************/
+void x264_mb_dequant_2x2_dc( int16_t dct[2][2], int i_qscale )
+{
+    /* Rescale the 2x2 DC coefficients in place with the (0,0) multiplier for
+     * i_qscale%6.  The net shift is i_qscale/6 - 1: a left shift folded into
+     * the multiplier when it is >= 0, otherwise a right shift by one of
+     * the product. */
+    const int i_mf    = dequant_mf[i_qscale%6][0][0];
+    const int i_qbits = i_qscale/6 - 1;
+    int x, y;
+
+    for( y = 0; y < 2; y++ )
+    {
+        for( x = 0; x < 2; x++ )
+        {
+            const int i_coef = dct[y][x];
+
+            if( i_qbits >= 0 )
+                dct[y][x] = i_coef * ( i_mf << i_qbits );
+            else
+                dct[y][x] = ( i_coef * i_mf ) >> 1;
+        }
+    }
+}
+
+void x264_mb_dequant_4x4_dc( int16_t dct[4][4], int i_qscale )
+{
+    /* Rescale the 4x4 DC coefficients in place: multiply by the (0,0)
+     * multiplier for i_qscale%6 and apply a net shift of i_qscale/6 - 2
+     * (left when >= 0, otherwise a rounded right shift). */
+    const int i_dmf   = dequant_mf[i_qscale%6][0][0];
+    const int i_qbits = i_qscale/6 - 2;
+    int x, y;
+
+    if( i_qbits >= 0 )
+    {
+        for( y = 0; y < 4; y++ )
+        {
+            for( x = 0; x < 4; x++ )
+            {
+                dct[y][x] = dct[y][x] * ( i_dmf << i_qbits );
+            }
+        }
+    }
+    else
+    {
+        /* Right shift by 1 or 2 with round-to-nearest: the offset is half the
+         * divisor, 1 << (-i_qbits - 1).  The former 1 << (1 + i_qbits) was a
+         * negative shift count (undefined) whenever i_qscale < 6. */
+        const int f = 1 << ( -i_qbits - 1 );
+
+        for( y = 0; y < 4; y++ )
+            for( x = 0; x < 4; x++ )
+                dct[y][x] = ( dct[y][x] * i_dmf + f ) >> (-i_qbits);
+    }
+}
+
+void x264_mb_dequant_4x4( int16_t dct[4][4], int i_qscale )
+{
+    /* Rescale all 16 coefficients in place: coefficient (y,x) is multiplied by
+     * dequant_mf[i_qscale%6][y][x] and by 2^(i_qscale/6).  The power of two is
+     * applied as a multiply because left-shifting a negative product is
+     * undefined behaviour in C. */
+    const int i_mf    = i_qscale%6;
+    const int i_scale = 1 << ( i_qscale/6 );
+    int x, y;
+
+    for( y = 0; y < 4; y++ )
+        for( x = 0; x < 4; x++ )
+            dct[y][x] = dct[y][x] * dequant_mf[i_mf][y][x] * i_scale;
+}
+
+static inline int x264_median( int a, int b, int c )
+{
+    /* Median of three: total minus the smallest and the largest value. */
+    int lo, hi;
+
+    if( b < a ) { lo = b; hi = a; }
+    else        { lo = a; hi = b; }
+
+    if( c < lo )
+        lo = c;
+    else if( c > hi )
+        hi = c;
+
+    return a + b + c - lo - hi;
+}
+
+void x264_mb_predict_mv( x264_t *h, int i_list, int idx, int i_width, int mvp[2] )
+{   /* Writes into mvp[] the motion vector predictor for block idx of list i_list; i_width is the cache offset from the above entry to the above-right candidate (presumably the partition width in 4x4 blocks - confirm with callers). */
+    const int i8 = x264_scan8[idx];
+    const int i_ref= h->mb.cache.ref[i_list][i8];
+    int     i_refa = h->mb.cache.ref[i_list][i8 - 1];
+    int16_t *mv_a  = h->mb.cache.mv[i_list][i8 - 1];
+    int     i_refb = h->mb.cache.ref[i_list][i8 - 8];
+    int16_t *mv_b  = h->mb.cache.mv[i_list][i8 - 8];
+    int     i_refc = h->mb.cache.ref[i_list][i8 - 8 + i_width ];
+    int16_t *mv_c  = h->mb.cache.mv[i_list][i8 - 8 + i_width];
+    /* candidates: A = left (i8 - 1), B = above (i8 - 8), C = above-right (i8 - 8 + i_width) */
+    int i_count;
+    /* C is replaced by the above-left entry (i8 - 8 - 1) for the block positions tested here, or when its ref is -2 (the value the cache is initialised with for "not available") */
+    if( (idx&0x03) == 3 || ( i_width == 2 && (idx&0x3) == 2 )|| i_refc == -2 )
+    {
+        i_refc = h->mb.cache.ref[i_list][i8 - 8 - 1];
+        mv_c   = h->mb.cache.mv[i_list][i8 - 8 - 1];
+    }
+    /* 16x8 / 8x16 shortcuts: one designated neighbour is used directly when it has the same reference as the current block */
+    if( h->mb.i_partition == D_16x8 )
+    {
+        if( idx == 0 && i_refb == i_ref )
+        {
+            mvp[0] = mv_b[0];
+            mvp[1] = mv_b[1];
+            return;
+        }
+        else if( idx != 0 && i_refa == i_ref )
+        {
+            mvp[0] = mv_a[0];
+            mvp[1] = mv_a[1];
+            return;
+        }
+    }
+    else if( h->mb.i_partition == D_8x16 )
+    {
+        if( idx == 0 && i_refa == i_ref )
+        {
+            mvp[0] = mv_a[0];
+            mvp[1] = mv_a[1];
+            return;
+        }
+        else if( idx != 0 && i_refc == i_ref )
+        {
+            mvp[0] = mv_c[0];
+            mvp[1] = mv_c[1];
+            return;
+        }
+    }
+    /* general case: count the candidates sharing the current reference; exactly one -> copy it, only A available -> copy A, otherwise component-wise median */
+    i_count = 0;
+    if( i_refa == i_ref ) i_count++;
+    if( i_refb == i_ref ) i_count++;
+    if( i_refc == i_ref ) i_count++;
+
+    if( i_count > 1 )
+    {
+        mvp[0] = x264_median( mv_a[0], mv_b[0], mv_c[0] );
+        mvp[1] = x264_median( mv_a[1], mv_b[1], mv_c[1] );
+    }
+    else if( i_count == 1 )
+    {
+        if( i_refa == i_ref )
+        {
+            mvp[0] = mv_a[0];
+            mvp[1] = mv_a[1];
+        }
+        else if( i_refb == i_ref )
+        {
+            mvp[0] = mv_b[0];
+            mvp[1] = mv_b[1];
+        }
+        else
+        {
+            mvp[0] = mv_c[0];
+            mvp[1] = mv_c[1];
+        }
+    }
+    else if( i_refb == -2 && i_refc == -2 && i_refa != -2 )
+    {
+        mvp[0] = mv_a[0];
+        mvp[1] = mv_a[1];
+    }
+    else
+    {
+        mvp[0] = x264_median( mv_a[0], mv_b[0], mv_c[0] );
+        mvp[1] = x264_median( mv_a[1], mv_b[1], mv_c[1] );
+    }
+}
+
+void x264_mb_predict_mv_16x16( x264_t *h, int i_list, int i_ref, int mvp[2] )
+{
+    /* Motion vector predictor for a 16x16 partition using reference i_ref of
+     * list i_list.  Candidates come from the cache entries around the first
+     * block: A = left (-1), B = above (-8), C = above-right (-8 + 4). */
+    const int i_pos_a = X264_SCAN8_0 - 1;
+    const int i_pos_b = X264_SCAN8_0 - 8;
+    int       i_pos_c = X264_SCAN8_0 - 8 + 4;
+    const int16_t *mv_a, *mv_b, *mv_c;
+    const int16_t *mv_single = NULL;
+    int i_refa, i_refb, i_refc;
+    int b_a, b_b, b_c;
+
+    /* a C entry flagged -2 is replaced by the above-left entry (-8 - 1) */
+    if( h->mb.cache.ref[i_list][i_pos_c] == -2 )
+        i_pos_c = X264_SCAN8_0 - 8 - 1;
+
+    i_refa = h->mb.cache.ref[i_list][i_pos_a];
+    i_refb = h->mb.cache.ref[i_list][i_pos_b];
+    i_refc = h->mb.cache.ref[i_list][i_pos_c];
+    mv_a   = h->mb.cache.mv[i_list][i_pos_a];
+    mv_b   = h->mb.cache.mv[i_list][i_pos_b];
+    mv_c   = h->mb.cache.mv[i_list][i_pos_c];
+
+    /* which candidates use the requested reference? */
+    b_a = ( i_refa == i_ref );
+    b_b = ( i_refb == i_ref );
+    b_c = ( i_refc == i_ref );
+
+    switch( b_a + b_b + b_c )
+    {
+        case 1:
+            /* exactly one match: take that candidate as is */
+            mv_single = b_a ? mv_a : ( b_b ? mv_b : mv_c );
+            break;
+        case 0:
+            /* no match: if only A is not flagged -2, fall back to A */
+            if( i_refb == -2 && i_refc == -2 && i_refa != -2 )
+                mv_single = mv_a;
+            break;
+        default:
+            break;
+    }
+
+    if( mv_single != NULL )
+    {
+        mvp[0] = mv_single[0];
+        mvp[1] = mv_single[1];
+    }
+    else
+    {
+        /* two or more matches, or no usable shortcut: component-wise median */
+        mvp[0] = x264_median( mv_a[0], mv_b[0], mv_c[0] );
+        mvp[1] = x264_median( mv_a[1], mv_b[1], mv_c[1] );
+    }
+}
+
+
+void x264_mb_predict_mv_pskip( x264_t *h, int mv[2] )
+{
+    /* P-skip vector: zero when the left (A) or above (B) entry is flagged -2 or is a
+     * zero vector on reference 0; otherwise the 16x16 predictor for list 0, ref 0. */
+    const int i_refa = h->mb.cache.ref[0][X264_SCAN8_0 - 1];
+    const int i_refb = h->mb.cache.ref[0][X264_SCAN8_0 - 8];
+    const int16_t *mv_a = h->mb.cache.mv[0][X264_SCAN8_0 - 1];
+    const int16_t *mv_b = h->mb.cache.mv[0][X264_SCAN8_0 - 8];
+    const int b_zero_a = ( i_refa == 0 && mv_a[0] == 0 && mv_a[1] == 0 );
+    const int b_zero_b = ( i_refb == 0 && mv_b[0] == 0 && mv_b[1] == 0 );
+
+    if( i_refa != -2 && i_refb != -2 && !b_zero_a && !b_zero_b )
+    {
+        x264_mb_predict_mv_16x16( h, 0, 0, mv );
+        return;
+    }
+    mv[0] = mv[1] = 0;
+}
+
+static inline void x264_mb_mc_0xywh( x264_t *h, int x, int y, int width, int height )
+{
+    /* List-0 prediction of one partition into fdec; x, y, width, height are in units of 4 luma (2 chroma) pixels. */
+    const int i8    = x264_scan8[0]+x+8*y;
+    const int i_ref = h->mb.cache.ref[0][i8];
+    const int mvx   = h->mb.cache.mv[0][i8][0];
+    const int mvy   = h->mb.cache.mv[0][i8][1];
+    int i;
+    h->mc[MC_LUMA]( &h->mb.pic.p_fref[0][i_ref][0][4*y * h->mb.pic.i_stride[0]+4*x], h->mb.pic.i_stride[0],
+                    &h->mb.pic.p_fdec[0][4*y * h->mb.pic.i_stride[0]+4*x],           h->mb.pic.i_stride[0],
+                    mvx, mvy, 4*width, 4*height );
+    for( i = 1; i < 3; i++ )
+    {
+        const int i_stride = h->mb.pic.i_stride[i];
+        h->mc[MC_CHROMA]( &h->mb.pic.p_fref[0][i_ref][i][2*y*i_stride+2*x], i_stride,
+                          &h->mb.pic.p_fdec[i][2*y*i_stride+2*x],           i_stride,
+                          mvx, mvy, 2*width, 2*height );
+    }
+}
+static inline void x264_mb_mc_1xywh( x264_t *h, int x, int y, int width, int height )
+{
+    /* List-1 prediction of one partition into fdec; x, y, width, height are in units of 4 luma (2 chroma) pixels. */
+    const int i8    = x264_scan8[0]+x+8*y;
+    const int i_ref = h->mb.cache.ref[1][i8];
+    const int mvx   = h->mb.cache.mv[1][i8][0];
+    const int mvy   = h->mb.cache.mv[1][i8][1];
+    int i;
+    h->mc[MC_LUMA]( &h->mb.pic.p_fref[1][i_ref][0][4*y * h->mb.pic.i_stride[0]+4*x], h->mb.pic.i_stride[0],
+                    &h->mb.pic.p_fdec[0][4*y *h->mb.pic.i_stride[0]+4*x],            h->mb.pic.i_stride[0],
+                    mvx, mvy, 4*width, 4*height );
+    for( i = 1; i < 3; i++ )
+    {
+        const int i_stride = h->mb.pic.i_stride[i];
+        h->mc[MC_CHROMA]( &h->mb.pic.p_fref[1][i_ref][i][2*y*i_stride+2*x], i_stride,
+                          &h->mb.pic.p_fdec[i][2*y*i_stride+2*x],           i_stride,
+                          mvx, mvy, 2*width, 2*height );
+    }
+}
+
+static inline void x264_mb_mc_01xywh( x264_t *h, int x, int y, int width, int height )
+{
+    /* Bi-predicted partition (x, y, width, height in units of 4 luma pixels):
+     * per plane, the list-0 prediction is written to fdec, the list-1
+     * prediction to tmp, and pixf.avg is then applied to (fdec, tmp). */
+    const int i8 = x264_scan8[0]+x+8*y;
+    const int i_ref0 = h->mb.cache.ref[0][i8];
+    const int mvx0   = h->mb.cache.mv[0][i8][0];
+    const int mvy0   = h->mb.cache.mv[0][i8][1];
+    const int i_ref1 = h->mb.cache.ref[1][i8];
+    const int mvx1   = h->mb.cache.mv[1][i8][0];
+    const int mvy1   = h->mb.cache.mv[1][i8][1];
+    DECLARE_ALIGNED( uint8_t, tmp[16*16], 16 );
+    int     i_mode   = 0;   /* avg[] entry for the luma partition */
+    int     i_mode_c = -1;  /* avg[] entry for the half-size chroma partition */
+    if( width == 4 && height == 4 )      { i_mode = PIXEL_16x16; i_mode_c = PIXEL_8x8; }
+    else if( width == 4 && height == 2 ) { i_mode = PIXEL_16x8;  i_mode_c = PIXEL_8x4; }
+    else if( width == 2 && height == 4 ) { i_mode = PIXEL_8x16;  i_mode_c = PIXEL_4x8; }
+    else if( width == 2 && height == 2 ) { i_mode = PIXEL_8x8;   i_mode_c = PIXEL_4x4; }
+    else if( width == 2 && height == 1 ) i_mode = PIXEL_8x4;
+    else if( width == 1 && height == 2 ) i_mode = PIXEL_4x8;
+    else if( width == 1 && height == 1 ) i_mode = PIXEL_4x4;
+    /* The chroma blocks are only 2*width x 2*height pixels; averaging them with the
+     * luma-sized i_mode also blended stale tmp bytes into fdec outside the block.
+     * NOTE(review): no avg size below 4x4 is visible here, so smaller partitions keep i_mode. */
+    if( i_mode_c < 0 ) i_mode_c = i_mode;
+    h->mc[MC_LUMA]( &h->mb.pic.p_fref[0][i_ref0][0][4*y * h->mb.pic.i_stride[0]+4*x], h->mb.pic.i_stride[0],
+                    &h->mb.pic.p_fdec[0][4*y *h->mb.pic.i_stride[0]+4*x],             h->mb.pic.i_stride[0],
+                    mvx0, mvy0, 4*width, 4*height );
+    h->mc[MC_CHROMA]( &h->mb.pic.p_fref[0][i_ref0][1][2*y*h->mb.pic.i_stride[1]+2*x], h->mb.pic.i_stride[1],
+                      &h->mb.pic.p_fdec[1][2*y*h->mb.pic.i_stride[1]+2*x],            h->mb.pic.i_stride[1],
+                      mvx0, mvy0, 2*width, 2*height );
+    h->mc[MC_CHROMA]( &h->mb.pic.p_fref[0][i_ref0][2][2*y*h->mb.pic.i_stride[2]+2*x], h->mb.pic.i_stride[2],
+                      &h->mb.pic.p_fdec[2][2*y*h->mb.pic.i_stride[2]+2*x],            h->mb.pic.i_stride[2],
+                      mvx0, mvy0, 2*width, 2*height );
+    h->mc[MC_LUMA]( &h->mb.pic.p_fref[1][i_ref1][0][4*y * h->mb.pic.i_stride[0]+4*x], h->mb.pic.i_stride[0],
+                    tmp, 16, mvx1, mvy1, 4*width, 4*height );
+    h->pixf.avg[i_mode]( &h->mb.pic.p_fdec[0][4*y *h->mb.pic.i_stride[0]+4*x], h->mb.pic.i_stride[0], tmp, 16 );
+    h->mc[MC_CHROMA]( &h->mb.pic.p_fref[1][i_ref1][1][2*y*h->mb.pic.i_stride[1]+2*x], h->mb.pic.i_stride[1],
+                      tmp, 16, mvx1, mvy1, 2*width, 2*height );
+    h->pixf.avg[i_mode_c]( &h->mb.pic.p_fdec[1][2*y*h->mb.pic.i_stride[1]+2*x], h->mb.pic.i_stride[1], tmp, 16 );
+    h->mc[MC_CHROMA]( &h->mb.pic.p_fref[1][i_ref1][2][2*y*h->mb.pic.i_stride[2]+2*x], h->mb.pic.i_stride[2],
+                      tmp, 16, mvx1, mvy1, 2*width, 2*height );
+    h->pixf.avg[i_mode_c]( &h->mb.pic.p_fdec[2][2*y*h->mb.pic.i_stride[2]+2*x], h->mb.pic.i_stride[2], tmp, 16 );
+}
+
+
+void x264_mb_mc( x264_t *h )
+{
+    /* Motion-compensate the current macroblock into fdec according to its type
+     * and partitioning.  Positions and sizes handed to the x264_mb_mc_*xywh
+     * helpers are in units of 4 luma pixels. */
+    const int i_type = h->mb.i_type;
+    int i_parts = 0;            /* partitions to predict (0: none) */
+    int i_dx = 0, i_dy = 0;     /* position of the second partition */
+    int i_w = 0, i_h = 0;       /* size of each partition */
+    int i;
+
+    if( i_type == P_8x8 )
+    {
+        /* four 8x8 quadrants, each split according to its own sub-partition */
+        for( i = 0; i < 4; i++ )
+        {
+            const int x = 2*(i%2);
+            const int y = 2*(i/2);
+            int w = 0, ht = 0;
+            int dx, dy;
+
+            switch( h->mb.i_sub_partition[i] )
+            {
+                case D_L0_8x8:
+                    w  = 2;
+                    ht = 2;
+                    break;
+                case D_L0_8x4:
+                    w  = 2;
+                    ht = 1;
+                    break;
+                case D_L0_4x8:
+                    w  = 1;
+                    ht = 2;
+                    break;
+                case D_L0_4x4:
+                    w  = 1;
+                    ht = 1;
+                    break;
+            }
+            if( w == 0 )
+                continue;   /* any other sub-partition value: nothing to do */
+
+            /* tile the quadrant row by row with w x ht blocks */
+            for( dy = 0; dy < 2; dy += ht )
+            {
+                for( dx = 0; dx < 2; dx += w )
+                {
+                    x264_mb_mc_0xywh( h, x+dx, y+dy, w, ht );
+                }
+            }
+        }
+        return;
+    }
+    if( i_type == B_8x8 || i_type == B_DIRECT )
+    {
+        fprintf( stderr, "mc_luma with unsupported mb\n" );
+        return;
+    }
+
+    /* P_L0 and every remaining type (B_*x*): one or two partitions */
+    if( h->mb.i_partition == D_16x16 )
+    {
+        i_parts = 1; i_w = 4; i_h = 4;
+    }
+    else if( h->mb.i_partition == D_16x8 )
+    {
+        i_parts = 2; i_w = 4; i_h = 2; i_dy = 2;
+    }
+    else if( h->mb.i_partition == D_8x16 )
+    {
+        i_parts = 2; i_w = 2; i_h = 4; i_dx = 2;
+    }
+
+    for( i = 0; i < i_parts; i++ )
+    {
+        const int x = i * i_dx;
+        const int y = i * i_dy;
+
+        if( i_type == P_L0 )
+        {
+            x264_mb_mc_0xywh( h, x, y, i_w, i_h );
+            continue;
+        }
+
+        /* B types: the list tables say which lists partition i uses */
+        if( x264_mb_type_list0_table[i_type][i] && x264_mb_type_list1_table[i_type][i] )
+            x264_mb_mc_01xywh( h, x, y, i_w, i_h );
+        else if( x264_mb_type_list0_table[i_type][i] )
+            x264_mb_mc_0xywh ( h, x, y, i_w, i_h );
+        else if( x264_mb_type_list1_table[i_type][i] )
+            x264_mb_mc_1xywh ( h, x, y, i_w, i_h );
+    }
+}
+
+void x264_macroblock_cache_init( x264_t *h )
+{
+    /* Allocate the per-macroblock arrays hanging off h->mb, sized from the
+     * macroblock dimensions in the SPS, and reset the cached reference indices. */
+    const int i_mb_count = h->sps->i_mb_width * h->sps->i_mb_height;
+    int i_list;
+
+    h->mb.i_mb_stride = h->sps->i_mb_width;
+
+    h->mb.type = x264_malloc( i_mb_count * sizeof( int8_t) );
+    h->mb.qp   = x264_malloc( i_mb_count * sizeof( int8_t) );
+    h->mb.cbp  = x264_malloc( i_mb_count * sizeof( int16_t) );
+
+    /* 7 entries per macroblock: 0 -> 3 top(4), 4 -> 6 left(3) */
+    h->mb.intra4x4_pred_mode = x264_malloc( i_mb_count * 7 * sizeof( int8_t ) );
+    /* 24 entries per macroblock: all coeffs */
+    h->mb.non_zero_count = x264_malloc( i_mb_count * 24 * sizeof( uint8_t ) );
+
+    for( i_list = 0; i_list < 2; i_list++ )
+    {
+        h->mb.mv[i_list]  = x264_malloc( 2*16 * i_mb_count * sizeof( int16_t ) );
+        h->mb.ref[i_list] = x264_malloc( 4 * i_mb_count * sizeof( int16_t ) );
+        if( h->param.b_cabac )
+            h->mb.mvd[i_list] = x264_malloc( 2*16 * i_mb_count * sizeof( int16_t ) );
+
+        /* init with not available (for top right idx=7,15) */
+        memset( h->mb.cache.ref[i_list], -2, X264_SCAN8_SIZE * sizeof( int8_t ) );
+    }
+    /* the chroma prediction mode array is only allocated when b_cabac is set */
+    if( h->param.b_cabac )
+        h->mb.chroma_pred_mode = x264_malloc( i_mb_count * sizeof( int8_t) );
+}
+void x264_macroblock_cache_end( x264_t *h )
+{   /* Free every array allocated by x264_macroblock_cache_init. */
+    if( h->param.b_cabac )
+    {   /* allocated only under b_cabac; NOTE(review): assumes b_cabac has not changed since init */
+        x264_free( h->mb.chroma_pred_mode );
+        x264_free( h->mb.mvd[0] );
+        x264_free( h->mb.mvd[1] );
+    }
+    x264_free( h->mb.mv[0] );
+    x264_free( h->mb.mv[1] );
+    x264_free( h->mb.ref[0] );
+    x264_free( h->mb.ref[1] );
+    x264_free( h->mb.intra4x4_pred_mode );
+    x264_free( h->mb.non_zero_count );
+    x264_free( h->mb.cbp );
+    x264_free( h->mb.qp );
+    x264_free( h->mb.type );
+}
+
+
+void x264_macroblock_cache_load( x264_t *h, int i_mb_x, int i_mb_y )
+{   /* Select macroblock (i_mb_x, i_mb_y): set indices and picture pointers, then copy neighbour data into h->mb.cache. */
+    const int i_mb_4x4 = 16 * h->mb.i_mb_stride *i_mb_y + 4 * i_mb_x;    /* first entry of this MB in h->mb.mv / h->mb.mvd */
+    const int i_mb_8x8 =  4 * h->mb.i_mb_stride *i_mb_y + 2 * i_mb_x;    /* first entry of this MB in h->mb.ref */
+
+    int i_top_xy = -1;      /* -1 while the neighbour does not exist */
+    int i_left_xy = -1;
+    int i_top_type = -1;    /* gcc warn */
+    int i_left_type= -1;    /* like i_top_type: assigned below but not read in this function */
+
+    int i;
+
+    /* init index */
+    h->mb.i_mb_x = i_mb_x;
+    h->mb.i_mb_y = i_mb_y;
+    h->mb.i_mb_xy = i_mb_y * h->mb.i_mb_stride + i_mb_x;
+    h->mb.i_neighbour = 0;
+
+    /* load picture pointers */
+    for( i = 0; i < 3; i++ )
+    {
+        const int w = (i == 0 ? 16 : 8);    /* macroblock size in pixels: 16 for plane 0, 8 for planes 1 and 2 */
+        const int i_stride = h->fdec->i_stride[i];    /* NOTE(review): also used to address fenc and the reference frames - presumably all share this stride */
+        int   j;
+
+        h->mb.pic.i_stride[i] = i_stride;
+
+        h->mb.pic.p_fenc[i] = &h->fenc->plane[i][ w * ( i_mb_x + i_mb_y * i_stride )];
+
+        h->mb.pic.p_fdec[i] = &h->fdec->plane[i][ w * ( i_mb_x + i_mb_y * i_stride )];
+
+        for( j = 0; j < h->i_ref0; j++ )
+        {
+            h->mb.pic.p_fref[0][j][i] = &h->fref0[j]->plane[i][ w * ( i_mb_x + i_mb_y * i_stride )];
+        }
+        for( j = 0; j < h->i_ref1; j++ )
+        {
+            h->mb.pic.p_fref[1][j][i] = &h->fref1[j]->plane[i][ w * ( i_mb_x + i_mb_y * i_stride )];
+        }
+    }
+
+    /* load cache: neighbour values go into the entries above (index - 8) and left of (index - 1) the current MB */
+    if( i_mb_y > 0 )
+    {
+        i_top_xy  = h->mb.i_mb_xy - h->mb.i_mb_stride;
+        i_top_type= h->mb.type[i_top_xy];
+
+        h->mb.i_neighbour |= MB_TOP;
+
+        /* load intra4x4 */
+        h->mb.cache.intra4x4_pred_mode[x264_scan8[0] - 8] = h->mb.intra4x4_pred_mode[i_top_xy][0];
+        h->mb.cache.intra4x4_pred_mode[x264_scan8[1] - 8] = h->mb.intra4x4_pred_mode[i_top_xy][1];
+        h->mb.cache.intra4x4_pred_mode[x264_scan8[4] - 8] = h->mb.intra4x4_pred_mode[i_top_xy][2];
+        h->mb.cache.intra4x4_pred_mode[x264_scan8[5] - 8] = h->mb.intra4x4_pred_mode[i_top_xy][3];
+
+        /* load non_zero_count */
+        h->mb.cache.non_zero_count[x264_scan8[0] - 8] = h->mb.non_zero_count[i_top_xy][10];
+        h->mb.cache.non_zero_count[x264_scan8[1] - 8] = h->mb.non_zero_count[i_top_xy][11];
+        h->mb.cache.non_zero_count[x264_scan8[4] - 8] = h->mb.non_zero_count[i_top_xy][14];
+        h->mb.cache.non_zero_count[x264_scan8[5] - 8] = h->mb.non_zero_count[i_top_xy][15];
+
+        h->mb.cache.non_zero_count[x264_scan8[16+0] - 8] = h->mb.non_zero_count[i_top_xy][16+2];
+        h->mb.cache.non_zero_count[x264_scan8[16+1] - 8] = h->mb.non_zero_count[i_top_xy][16+3];
+
+        h->mb.cache.non_zero_count[x264_scan8[16+4+0] - 8] = h->mb.non_zero_count[i_top_xy][16+4+2];
+        h->mb.cache.non_zero_count[x264_scan8[16+4+1] - 8] = h->mb.non_zero_count[i_top_xy][16+4+3];
+    }
+    else
+    {
+        /* load intra4x4: no macroblock above, store -1 instead */
+        h->mb.cache.intra4x4_pred_mode[x264_scan8[0] - 8] =
+        h->mb.cache.intra4x4_pred_mode[x264_scan8[1] - 8] =
+        h->mb.cache.intra4x4_pred_mode[x264_scan8[4] - 8] =
+        h->mb.cache.intra4x4_pred_mode[x264_scan8[5] - 8] = -1;
+
+        /* load non_zero_count: no macroblock above, store 0x80 instead */
+        h->mb.cache.non_zero_count[x264_scan8[0] - 8] =
+        h->mb.cache.non_zero_count[x264_scan8[1] - 8] =
+        h->mb.cache.non_zero_count[x264_scan8[4] - 8] =
+        h->mb.cache.non_zero_count[x264_scan8[5] - 8] =
+        h->mb.cache.non_zero_count[x264_scan8[16+0] - 8] =
+        h->mb.cache.non_zero_count[x264_scan8[16+1] - 8] =
+        h->mb.cache.non_zero_count[x264_scan8[16+4+0] - 8] =
+        h->mb.cache.non_zero_count[x264_scan8[16+4+1] - 8] = 0x80;
+
+    }
+
+    if( i_mb_x > 0 )
+    {
+        i_left_xy  = h->mb.i_mb_xy - 1;
+        i_left_type= h->mb.type[i_left_xy];
+
+        h->mb.i_neighbour |= MB_LEFT;
+
+        /* load intra4x4 */
+        h->mb.cache.intra4x4_pred_mode[x264_scan8[0 ] - 1] = h->mb.intra4x4_pred_mode[i_left_xy][4];
+        h->mb.cache.intra4x4_pred_mode[x264_scan8[2 ] - 1] = h->mb.intra4x4_pred_mode[i_left_xy][5];
+        h->mb.cache.intra4x4_pred_mode[x264_scan8[8 ] - 1] = h->mb.intra4x4_pred_mode[i_left_xy][6];
+        h->mb.cache.intra4x4_pred_mode[x264_scan8[10] - 1] = h->mb.intra4x4_pred_mode[i_left_xy][3];    /* entry 3 is shared with the top set */
+
+        /* load non_zero_count */
+        h->mb.cache.non_zero_count[x264_scan8[0 ] - 1] = h->mb.non_zero_count[i_left_xy][5];
+        h->mb.cache.non_zero_count[x264_scan8[2 ] - 1] = h->mb.non_zero_count[i_left_xy][7];
+        h->mb.cache.non_zero_count[x264_scan8[8 ] - 1] = h->mb.non_zero_count[i_left_xy][13];
+        h->mb.cache.non_zero_count[x264_scan8[10] - 1] = h->mb.non_zero_count[i_left_xy][15];
+
+        h->mb.cache.non_zero_count[x264_scan8[16+0] - 1] = h->mb.non_zero_count[i_left_xy][16+1];
+        h->mb.cache.non_zero_count[x264_scan8[16+2] - 1] = h->mb.non_zero_count[i_left_xy][16+3];
+
+        h->mb.cache.non_zero_count[x264_scan8[16+4+0] - 1] = h->mb.non_zero_count[i_left_xy][16+4+1];
+        h->mb.cache.non_zero_count[x264_scan8[16+4+2] - 1] = h->mb.non_zero_count[i_left_xy][16+4+3];
+    }
+    else
+    {   /* no macroblock on the left: same placeholder values as for a missing top macroblock */
+        h->mb.cache.intra4x4_pred_mode[x264_scan8[0 ] - 1] =
+        h->mb.cache.intra4x4_pred_mode[x264_scan8[2 ] - 1] =
+        h->mb.cache.intra4x4_pred_mode[x264_scan8[8 ] - 1] =
+        h->mb.cache.intra4x4_pred_mode[x264_scan8[10] - 1] = -1;
+
+        /* load non_zero_count */
+        h->mb.cache.non_zero_count[x264_scan8[0 ] - 1] =
+        h->mb.cache.non_zero_count[x264_scan8[2 ] - 1] =
+        h->mb.cache.non_zero_count[x264_scan8[8 ] - 1] =
+        h->mb.cache.non_zero_count[x264_scan8[10] - 1] =
+        h->mb.cache.non_zero_count[x264_scan8[16+0] - 1] =
+        h->mb.cache.non_zero_count[x264_scan8[16+2] - 1] =
+        h->mb.cache.non_zero_count[x264_scan8[16+4+0] - 1] =
+        h->mb.cache.non_zero_count[x264_scan8[16+4+2] - 1] = 0x80;
+    }
+
+    if( i_mb_y > 0 && i_mb_x < h->sps->i_mb_width - 1 )
+    {
+        h->mb.i_neighbour |= MB_TOPRIGHT;
+    }
+
+    /* load ref/mv/mvd (skipped entirely for I slices) */
+    if( h->sh.i_type != SLICE_TYPE_I )
+    {
+        int s8x8 = 2 * h->mb.i_mb_stride;    /* row stride of h->mb.ref */
+        int s4x4 = 4 * h->mb.i_mb_stride;    /* row stride of h->mb.mv / h->mb.mvd */
+
+        int i_top_left_xy   = -1;
+        int i_top_right_xy  = -1;
+
+        int i_list;
+
+        if( h->mb.i_mb_y > 0 && h->mb.i_mb_x > 0 )
+        {
+            i_top_left_xy   = i_top_xy - 1;
+        }
+        if( h->mb.i_mb_y > 0 && h->mb.i_mb_x < h->sps->i_mb_width - 1 )
+        {
+            i_top_right_xy = i_top_xy + 1;
+        }
+
+        for( i_list = 0; i_list < (h->sh.i_type == SLICE_TYPE_P ? 1  : 2 ); i_list++ )
+        {
+            /*
+            h->mb.cache.ref[i_list][x264_scan8[5 ]+1] =
+            h->mb.cache.ref[i_list][x264_scan8[7 ]+1] =
+            h->mb.cache.ref[i_list][x264_scan8[13]+1] = -2;
+            */
+
+            if( i_top_left_xy >= 0 )
+            {
+                const int i8 = x264_scan8[0] - 1 - 1*8;
+                const int ir = i_mb_8x8 - s8x8 - 1;
+                const int iv = i_mb_4x4 - s4x4 - 1;
+                h->mb.cache.ref[i_list][i8]  = h->mb.ref[i_list][ir];
+                h->mb.cache.mv[i_list][i8][0] = h->mb.mv[i_list][iv][0];
+                h->mb.cache.mv[i_list][i8][1] = h->mb.mv[i_list][iv][1];
+            }
+            else
+            {
+                const int i8 = x264_scan8[0] - 1 - 1*8;
+                h->mb.cache.ref[i_list][i8] = -2;    /* missing neighbour: ref -2, mv (0,0) */
+                h->mb.cache.mv[i_list][i8][0] = 0;
+                h->mb.cache.mv[i_list][i8][1] = 0;
+            }
+
+            if( i_top_xy >= 0 )
+            {
+                const int i8 = x264_scan8[0] - 8;
+                const int ir = i_mb_8x8 - s8x8;
+                const int iv = i_mb_4x4 - s4x4;
+
+                h->mb.cache.ref[i_list][i8+0] =
+                h->mb.cache.ref[i_list][i8+1] = h->mb.ref[i_list][ir + 0];    /* each stored ref fills two cache entries */
+                h->mb.cache.ref[i_list][i8+2] =
+                h->mb.cache.ref[i_list][i8+3] = h->mb.ref[i_list][ir + 1];
+
+                for( i = 0; i < 4; i++ )
+                {
+                    h->mb.cache.mv[i_list][i8+i][0] = h->mb.mv[i_list][iv + i][0];
+                    h->mb.cache.mv[i_list][i8+i][1] = h->mb.mv[i_list][iv + i][1];
+                }
+            }
+            else
+            {
+                const int i8 = x264_scan8[0] - 8;
+                for( i = 0; i < 4; i++ )
+                {
+                    h->mb.cache.ref[i_list][i8+i] = -2;
+                    h->mb.cache.mv[i_list][i8+i][0] =
+                    h->mb.cache.mv[i_list][i8+i][1] = 0;
+                }
+            }
+
+            if( i_top_right_xy >= 0 )
+            {
+                const int i8 = x264_scan8[0] + 4 - 1*8;
+                const int ir = i_mb_8x8 - s8x8 + 2;
+                const int iv = i_mb_4x4 - s4x4 + 4;
+
+                h->mb.cache.ref[i_list][i8]  = h->mb.ref[i_list][ir];
+                h->mb.cache.mv[i_list][i8][0] = h->mb.mv[i_list][iv][0];
+                h->mb.cache.mv[i_list][i8][1] = h->mb.mv[i_list][iv][1];
+            }
+            else
+            {
+                const int i8 = x264_scan8[0] + 4 - 1*8;
+                h->mb.cache.ref[i_list][i8] = -2;
+                h->mb.cache.mv[i_list][i8][0] = 0;
+                h->mb.cache.mv[i_list][i8][1] = 0;
+            }
+
+            if( i_left_xy >= 0 )
+            {
+                const int i8 = x264_scan8[0] - 1;
+                const int ir = i_mb_8x8 - 1;
+                const int iv = i_mb_4x4 - 1;
+
+                h->mb.cache.ref[i_list][i8+0*8] =
+                h->mb.cache.ref[i_list][i8+1*8] = h->mb.ref[i_list][ir + 0*s8x8];
+                h->mb.cache.ref[i_list][i8+2*8] =
+                h->mb.cache.ref[i_list][i8+3*8] = h->mb.ref[i_list][ir + 1*s8x8];
+
+                for( i = 0; i < 4; i++ )
+                {
+                    h->mb.cache.mv[i_list][i8+i*8][0] = h->mb.mv[i_list][iv + i*s4x4][0];
+                    h->mb.cache.mv[i_list][i8+i*8][1] = h->mb.mv[i_list][iv + i*s4x4][1];
+                }
+            }
+            else
+            {
+                const int i8 = x264_scan8[0] - 1;
+                for( i = 0; i < 4; i++ )
+                {
+                    h->mb.cache.ref[i_list][i8+i*8] = -2;
+                    h->mb.cache.mv[i_list][i8+i*8][0] =
+                    h->mb.cache.mv[i_list][i8+i*8][1] = 0;
+                }
+            }
+
+            if( h->param.b_cabac )
+            {   /* mvd is only loaded from the top and left neighbours (no top-left / top-right) */
+                if( i_top_xy >= 0 )
+                {
+                    const int i8 = x264_scan8[0] - 8;
+                    const int iv = i_mb_4x4 - s4x4;
+                    for( i = 0; i < 4; i++ )
+                    {
+                        h->mb.cache.mvd[i_list][i8+i][0] = h->mb.mvd[i_list][iv + i][0];
+                        h->mb.cache.mvd[i_list][i8+i][1] = h->mb.mvd[i_list][iv + i][1];
+                    }
+                }
+                else
+                {
+                    const int i8 = x264_scan8[0] - 8;
+                    for( i = 0; i < 4; i++ )
+                    {
+                        h->mb.cache.mvd[i_list][i8+i][0] =
+                        h->mb.cache.mvd[i_list][i8+i][1] = 0;
+                    }
+                }
+
+                if( i_left_xy >= 0 )
+                {
+                    const int i8 = x264_scan8[0] - 1;
+                    const int iv = i_mb_4x4 - 1;
+                    for( i = 0; i < 4; i++ )
+                    {
+                        h->mb.cache.mvd[i_list][i8+i*8][0] = h->mb.mvd[i_list][iv + i*s4x4][0];
+                        h->mb.cache.mvd[i_list][i8+i*8][1] = h->mb.mvd[i_list][iv + i*s4x4][1];
+                    }
+                }
+                else
+                {
+                    const int i8 = x264_scan8[0] - 1;
+                    for( i = 0; i < 4; i++ )
+                    {
+                        h->mb.cache.mvd[i_list][i8+i*8][0] =
+                        h->mb.cache.mvd[i_list][i8+i*8][1] = 0;
+                    }
+                }
+            }
+        }
+    }
+}
+
+void x264_macroblock_cache_save( x264_t *h )
+{   /* Write the current macroblock's results from h->mb.cache back into the per-macroblock arrays of h->mb. */
+    const int i_mb_xy = h->mb.i_mb_xy;
+    const int i_mb_type = h->mb.i_type;
+    const int i_mb_4x4 = 16 * h->mb.i_mb_stride * h->mb.i_mb_y + 4 * h->mb.i_mb_x;    /* first entry of this MB in h->mb.mv / h->mb.mvd */
+    const int i_mb_8x8 =  4 * h->mb.i_mb_stride * h->mb.i_mb_y + 2 * h->mb.i_mb_x;    /* first entry of this MB in h->mb.ref */
+
+    int i;
+
+    h->mb.i_last_dqp = h->mb.qp[i_mb_xy] - h->mb.i_last_qp;    /* qp difference to the previously saved macroblock */
+    h->mb.i_last_qp = h->mb.qp[i_mb_xy];
+
+    /* save intra4x4: entries 0 -> 3 come from blocks 10,11,14,15 and 4 -> 6 from blocks 5,7,13 */
+    if( i_mb_type == I_4x4 )
+    {
+        h->mb.intra4x4_pred_mode[i_mb_xy][0] = h->mb.cache.intra4x4_pred_mode[x264_scan8[10] ];
+        h->mb.intra4x4_pred_mode[i_mb_xy][1] = h->mb.cache.intra4x4_pred_mode[x264_scan8[11] ];
+        h->mb.intra4x4_pred_mode[i_mb_xy][2] = h->mb.cache.intra4x4_pred_mode[x264_scan8[14] ];
+        h->mb.intra4x4_pred_mode[i_mb_xy][3] = h->mb.cache.intra4x4_pred_mode[x264_scan8[15] ];
+        h->mb.intra4x4_pred_mode[i_mb_xy][4] = h->mb.cache.intra4x4_pred_mode[x264_scan8[5] ];
+        h->mb.intra4x4_pred_mode[i_mb_xy][5] = h->mb.cache.intra4x4_pred_mode[x264_scan8[7] ];
+        h->mb.intra4x4_pred_mode[i_mb_xy][6] = h->mb.cache.intra4x4_pred_mode[x264_scan8[13] ];
+    }
+    else
+    {   /* every other macroblock type stores I_PRED_4x4_DC for all seven entries */
+        h->mb.intra4x4_pred_mode[i_mb_xy][0] =
+        h->mb.intra4x4_pred_mode[i_mb_xy][1] =
+        h->mb.intra4x4_pred_mode[i_mb_xy][2] =
+        h->mb.intra4x4_pred_mode[i_mb_xy][3] =
+        h->mb.intra4x4_pred_mode[i_mb_xy][4] =
+        h->mb.intra4x4_pred_mode[i_mb_xy][5] =
+        h->mb.intra4x4_pred_mode[i_mb_xy][6] = I_PRED_4x4_DC;
+    }
+
+    if( i_mb_type == I_PCM )
+    {
+        h->mb.cbp[i_mb_xy] = 0x72f;   /* all set */
+        for( i = 0; i < 16 + 2*4; i++ )
+        {
+            h->mb.non_zero_count[i_mb_xy][i] = 16;
+        }
+    }
+    else
+    {
+        /* save non zero count */
+        for( i = 0; i < 16 + 2*4; i++ )
+        {
+            h->mb.non_zero_count[i_mb_xy][i] = h->mb.cache.non_zero_count[x264_scan8[i]];
+        }
+    }
+
+    if( !IS_INTRA( i_mb_type ) )    /* NOTE(review): IS_INTRA does not include I_PCM, so I_PCM takes this branch - confirm intended */
+    {
+        int i_list;
+        for( i_list = 0; i_list < (h->sh.i_type == SLICE_TYPE_P ? 1  : 2 ); i_list++ )
+        {
+            const int s8x8 = 2 * h->mb.i_mb_stride;
+            const int s4x4 = 4 * h->mb.i_mb_stride;
+            int y,x;
+
+            h->mb.ref[i_list][i_mb_8x8+0+0*s8x8] = h->mb.cache.ref[i_list][x264_scan8[0]];
+            h->mb.ref[i_list][i_mb_8x8+1+0*s8x8] = h->mb.cache.ref[i_list][x264_scan8[4]];
+            h->mb.ref[i_list][i_mb_8x8+0+1*s8x8] = h->mb.cache.ref[i_list][x264_scan8[8]];
+            h->mb.ref[i_list][i_mb_8x8+1+1*s8x8] = h->mb.cache.ref[i_list][x264_scan8[12]];
+
+            for( y = 0; y < 4; y++ )
+            {
+                for( x = 0; x < 4; x++ )
+                {
+                    h->mb.mv[i_list][i_mb_4x4+x+y*s4x4][0] = h->mb.cache.mv[i_list][x264_scan8[0]+x+8*y][0];
+                    h->mb.mv[i_list][i_mb_4x4+x+y*s4x4][1] = h->mb.cache.mv[i_list][x264_scan8[0]+x+8*y][1];
+                }
+            }
+        }
+    }
+    else
+    {   /* intra macroblock: store ref -1 and zero motion vectors */
+        int i_list;
+        for( i_list = 0; i_list < (h->sh.i_type == SLICE_TYPE_P ? 1  : 2 ); i_list++ )
+        {
+            const int s8x8 = 2 * h->mb.i_mb_stride;
+            const int s4x4 = 4 * h->mb.i_mb_stride;
+            int y,x;
+
+            h->mb.ref[i_list][i_mb_8x8+0+0*s8x8] =
+            h->mb.ref[i_list][i_mb_8x8+1+0*s8x8] =
+            h->mb.ref[i_list][i_mb_8x8+0+1*s8x8] =
+            h->mb.ref[i_list][i_mb_8x8+1+1*s8x8] = -1;
+
+            for( y = 0; y < 4; y++ )
+            {
+                for( x = 0; x < 4; x++ )
+                {
+                    h->mb.mv[i_list][i_mb_4x4+x+y*s4x4][0] = 0;
+                    h->mb.mv[i_list][i_mb_4x4+x+y*s4x4][1] = 0;
+                }
+            }
+        }
+    }
+
+    if( h->param.b_cabac )
+    {   /* chroma_pred_mode and mvd are only allocated (and therefore only saved) when b_cabac is set */
+        if( i_mb_type == I_4x4 || i_mb_type == I_16x16 )
+            h->mb.chroma_pred_mode[i_mb_xy] = h->mb.i_chroma_pred_mode;
+        else
+            h->mb.chroma_pred_mode[i_mb_xy] = I_PRED_CHROMA_DC;
+
+        if( !IS_INTRA( i_mb_type ) && !IS_SKIP( i_mb_type ) )
+        {
+            int i_list;
+            for( i_list  = 0; i_list < 2; i_list++ )    /* both lists, whatever the slice type (unlike the ref/mv loops above) */
+            {
+                const int s4x4 = 4 * h->mb.i_mb_stride;
+                int y,x;
+                for( y = 0; y < 4; y++ )
+                {
+                    for( x = 0; x < 4; x++ )
+                    {
+                        h->mb.mvd[i_list][i_mb_4x4+x+y*s4x4][0] = h->mb.cache.mvd[i_list][x264_scan8[0]+x+8*y][0];
+                        h->mb.mvd[i_list][i_mb_4x4+x+y*s4x4][1] = h->mb.cache.mvd[i_list][x264_scan8[0]+x+8*y][1];
+                    }
+                }
+            }
+        }
+        else
+        {   /* intra or skipped macroblock: store zero mvd */
+            int i_list;
+            for( i_list  = 0; i_list < 2; i_list++ )
+            {
+                const int s4x4 = 4 * h->mb.i_mb_stride;
+                int y,x;
+                for( y = 0; y < 4; y++ )
+                {
+                    for( x = 0; x < 4; x++ )
+                    {
+                        h->mb.mvd[i_list][i_mb_4x4+x+y*s4x4][0] = 0;
+                        h->mb.mvd[i_list][i_mb_4x4+x+y*s4x4][1] = 0;
+                    }
+                }
+            }
+        }
+    }
+}
+
diff --git a/core/macroblock.h b/core/macroblock.h
new file mode 100644
index 00000000..5419f87e
--- /dev/null
+++ b/core/macroblock.h
@@ -0,0 +1,204 @@
+/*****************************************************************************
+ * macroblock.h: h264 encoder library
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: macroblock.h,v 1.1 2004/06/03 19:27:07 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#ifndef _MACROBLOCK_H
+#define _MACROBLOCK_H 1
+
+enum macroblock_position_e
+{   /* bit flags; x264_macroblock_cache_load ORs LEFT/TOP/TOPRIGHT into h->mb.i_neighbour */
+    MB_LEFT     = 0x01,
+    MB_TOP      = 0x02,
+    MB_TOPRIGHT = 0x04,
+
+    MB_PRIVATE  = 0x10,    /* NOTE(review): purpose not visible in this header - confirm against users */
+};
+
+
+/* XXX mb_type isn't the one written in the bitstream -> only internal usage (values of enum mb_class_e) */
+#define IS_INTRA(type) ( (type) == I_4x4 || (type) == I_16x16 )    /* NOTE(review): I_PCM is not matched here - confirm intended */
+#define IS_SKIP(type)  ( (type) == P_SKIP || (type) == B_SKIP )    /* both macros evaluate type twice */
+enum mb_class_e
+{   /* internal macroblock types; the order matches the rows of x264_mb_type_list0_table / x264_mb_type_list1_table */
+    I_4x4           = 0,
+    I_16x16         = 1,
+    I_PCM           = 2,
+
+    P_L0            = 3,
+    P_8x8           = 4,
+    P_SKIP          = 5,
+
+    B_DIRECT        = 6,
+    B_L0_L0         = 7,    /* B_<a>_<b>: presumably the prediction (L0, L1 or BI) of the first and second partition */
+    B_L0_L1         = 8,
+    B_L0_BI         = 9,
+    B_L1_L0         = 10,
+    B_L1_L1         = 11,
+    B_L1_BI         = 12,
+    B_BI_L0         = 13,
+    B_BI_L1         = 14,
+    B_BI_BI         = 15,
+    B_8x8           = 16,
+    B_SKIP          = 17,
+};
+static const int x264_mb_type_list0_table[18][2] =    /* [mb_class_e value][column]: 1 when list 0 is used; columns presumably index the two partitions */
+{
+    {0,0}, {0,0}, {0,0},    /* INTRA */
+    {1,1},                  /* P_L0 */
+    {0,0},                  /* P_8x8 */
+    {1,1},                  /* P_SKIP */
+    {0,0},                  /* B_DIRECT */
+    {1,1}, {1,0}, {1,1},    /* B_L0_* */
+    {0,1}, {0,0}, {0,1},    /* B_L1_* */
+    {1,1}, {1,0}, {1,1},    /* B_BI_* */
+    {0,0},                  /* B_8x8 */
+    {0,0}                   /* B_SKIP */
+};
+static const int x264_mb_type_list1_table[18][2] =    /* [mb_class_e value][column]: 1 when list 1 is used; columns presumably index the two partitions */
+{
+    {0,0}, {0,0}, {0,0},    /* INTRA */
+    {0,0},                  /* P_L0 */
+    {0,0},                  /* P_8x8 */
+    {0,0},                  /* P_SKIP */
+    {0,0},                  /* B_DIRECT */
+    {0,0}, {0,1}, {0,1},    /* B_L0_* */
+    {1,0}, {1,1}, {1,1},    /* B_L1_* */
+    {1,0}, {1,1}, {1,1},    /* B_BI_* */
+    {0,0},                  /* B_8x8 */
+    {0,0}                   /* B_SKIP */
+};
+
+#define IS_SUB4x4(type) ( ((type)==D_L0_4x4)||((type)==D_L1_4x4)||((type)==D_BI_4x4) )    /* sub-partition shape tests over enum mb_partition_e */
+#define IS_SUB4x8(type) ( ((type)==D_L0_4x8)||((type)==D_L1_4x8)||((type)==D_BI_4x8) )    /* (type) is parenthesized so any expression can be passed */
+#define IS_SUB8x4(type) ( ((type)==D_L0_8x4)||((type)==D_L1_8x4)||((type)==D_BI_8x4) )    /* type may be evaluated several times: pass no side effects */
+#define IS_SUB8x8(type) ( ((type)==D_L0_8x8)||((type)==D_L1_8x8)||((type)==D_BI_8x8)||((type)==D_DIRECT_8x8) )    /* also true for D_DIRECT_8x8 */
+enum mb_partition_e
+{   /* values index x264_mb_partition_count_table, which has one entry per enumerator (17) */
+    /* sub partition type for P_8x8 and B_8x8 */
+    D_L0_4x4        = 0,
+    D_L0_8x4        = 1,
+    D_L0_4x8        = 2,
+    D_L0_8x8        = 3,
+
+    /* sub partition type for B_8x8 only */
+    D_L1_4x4        = 4,
+    D_L1_8x4        = 5,
+    D_L1_4x8        = 6,
+    D_L1_8x8        = 7,
+
+    D_BI_4x4        = 8,
+    D_BI_8x4        = 9,
+    D_BI_4x8        = 10,
+    D_BI_8x8        = 11,
+    D_DIRECT_8x8    = 12,
+
+    /* partition (of the whole macroblock) */
+    D_8x8           = 13,
+    D_16x8          = 14,
+    D_8x16          = 15,
+    D_16x16         = 16,
+};
+
+static const int x264_mb_partition_count_table[17] =    /* indexed by enum mb_partition_e; presumably the number of blocks each type is split into */
+{
+    /* sub L0 */
+    4, 2, 2, 1,
+    /* sub L1 */
+    4, 2, 2, 1,
+    /* sub BI */
+    4, 2, 2, 1,
+    /* Direct */
+    1,
+    /* Partition */
+    4, 2, 2, 1
+};
+
+void x264_macroblock_cache_init( x264_t *h );
+void x264_macroblock_cache_load( x264_t *h, int, int );
+void x264_macroblock_cache_save( x264_t *h );
+void x264_macroblock_cache_end( x264_t *h );
+
+void x264_mb_dequant_4x4_dc( int16_t dct[4][4], int i_qscale );
+void x264_mb_dequant_2x2_dc( int16_t dct[2][2], int i_qscale );
+void x264_mb_dequant_4x4( int16_t dct[4][4], int i_qscale );
+
+/* x264_mb_predict_mv_16x16:
+ *      set mvp to the predicted mv for a D_16x16 block
+ *      h->mb only needs valid values from the other blocks */
+void x264_mb_predict_mv_16x16( x264_t *h, int i_list, int i_ref, int mvp[2] );
+/* x264_mb_predict_mv_pskip:
+ *      set mv to the predicted mv for P_SKIP
+ *      h->mb only needs valid values from the other blocks */
+void x264_mb_predict_mv_pskip( x264_t *h, int mv[2] );
+/* x264_mb_predict_mv:
+ *      set mvp to the predicted mv for all blocks except P_SKIP
+ *      h->mb needs valid ref/partition/sub for the current block
+ *      and valid mv/ref from the other blocks. */
+void x264_mb_predict_mv( x264_t *h, int i_list, int idx, int i_width, int mvp[2] );
+
+
+int  x264_mb_predict_intra4x4_mode( x264_t *h, int idx );
+int  x264_mb_predict_non_zero_code( x264_t *h, int idx );
+
+void x264_mb_encode_i4x4( x264_t *h, int idx, int i_qscale );
+
+void x264_mb_mc( x264_t *h );
+
+
+static inline void x264_macroblock_cache_ref( x264_t *h, int x, int y, int width, int height, int i_list, int ref )
+{   /* Store ref into the width x height rectangle of h->mb.cache.ref[i_list] whose corner is (x, y) relative to X264_SCAN8_0. */
+    int dy, dx;
+    for( dy = 0; dy < height; dy++ )
+    {
+        for( dx = 0; dx < width; dx++ )
+        {
+            h->mb.cache.ref[i_list][X264_SCAN8_0+x+dx+8*(y+dy)] = ref;    /* cache rows are 8 entries apart */
+        }
+    }
+}
+static inline void x264_macroblock_cache_mv( x264_t *h, int x, int y, int width, int height, int i_list, int mvx, int mvy )
+{   /* Store (mvx, mvy) into the width x height rectangle of h->mb.cache.mv[i_list] whose corner is (x, y) relative to X264_SCAN8_0. */
+    int dy, dx;
+    for( dy = 0; dy < height; dy++ )
+    {
+        for( dx = 0; dx < width; dx++ )
+        {
+            h->mb.cache.mv[i_list][X264_SCAN8_0+x+dx+8*(y+dy)][0] = mvx;    /* [0] = x component */
+            h->mb.cache.mv[i_list][X264_SCAN8_0+x+dx+8*(y+dy)][1] = mvy;    /* [1] = y component */
+        }
+    }
+}
+static inline void x264_macroblock_cache_mvd( x264_t *h, int x, int y, int width, int height, int i_list, int mdx, int mdy )
+{   /* Store (mdx, mdy) into the width x height rectangle of h->mb.cache.mvd[i_list] whose corner is (x, y) relative to X264_SCAN8_0. */
+    int dy, dx;
+    for( dy = 0; dy < height; dy++ )
+    {
+        for( dx = 0; dx < width; dx++ )
+        {
+            h->mb.cache.mvd[i_list][X264_SCAN8_0+x+dx+8*(y+dy)][0] = mdx;    /* [0] = x component */
+            h->mb.cache.mvd[i_list][X264_SCAN8_0+x+dx+8*(y+dy)][1] = mdy;    /* [1] = y component */
+        }
+    }
+}
+
+#endif
+
diff --git a/core/mc.c b/core/mc.c
new file mode 100644
index 00000000..e7ff7541
--- /dev/null
+++ b/core/mc.c
@@ -0,0 +1,320 @@
+/*****************************************************************************
+ * mc.c: h264 encoder library (Motion Compensation)
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: mc.c,v 1.1 2004/06/03 19:27:07 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdint.h>
+
+#include "../x264.h"
+
+#include "mc.h"
+#include "clip1.h"
+
+#ifdef _MSC_VER
+#undef HAVE_MMXEXT  /* not finished now */
+#endif
+#ifdef HAVE_MMXEXT
+#   include "i386/mc.h"
+#endif
+#ifdef HAVE_ALTIVEC
+#   include "ppc/mc.h"
+#endif
+
+
+static inline int x264_tapfilter( uint8_t *pix, int i_pix_next )
+{   /* 6-tap filter (1,-5,20,20,-5,1) over pix[-2..3] in steps of i_pix_next; the sum is returned unrounded and unscaled */
+    return pix[-2*i_pix_next] - 5*pix[-1*i_pix_next] + 20*(pix[0] + pix[1*i_pix_next]) - 5*pix[ 2*i_pix_next] + pix[ 3*i_pix_next];
+}
+static inline int x264_tapfilter1( uint8_t *pix )
+{   /* same 6-tap filter (1,-5,20,20,-5,1) over the adjacent pixels pix[-2..3]; the sum is returned unrounded and unscaled */
+    return pix[-2] - 5*pix[-1] + 20*(pix[0] + pix[1]) - 5*pix[ 2] + pix[ 3];
+}
+
+static inline void pixel_avg( uint8_t *dst,  int i_dst_stride,
+                              uint8_t *src1, int i_src1_stride,
+                              uint8_t *src2, int i_src2_stride,
+                              int i_width, int i_height )
+{   /* dst = per-pixel average of src1 and src2, rounded up, over an i_width x i_height block; each buffer has its own stride */
+    int x, y;
+    for( y = 0; y < i_height; y++ )
+    {
+        for( x = 0; x < i_width; x++ )
+        {
+            dst[x] = ( src1[x] + src2[x] + 1 ) >> 1;    /* +1 rounds halves upwards */
+        }
+        dst  += i_dst_stride;
+        src1 += i_src1_stride;
+        src2 += i_src2_stride;
+    }
+}
+
+typedef void (*pf_mc_t)(uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_width, int i_height );    /* signature shared by mc_copy and the mc_xy* routines */
+
+static void mc_copy( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_width, int i_height )
+{   /* Copy an i_width x i_height block of bytes from src to dst, row by row. */
+    int y;
+
+    for( y = 0; y < i_height; y++ )
+    {
+        memcpy( dst, src, i_width );    /* memcpy: the source and destination rows must not overlap */
+
+        src += i_src_stride;
+        dst += i_dst_stride;
+    }
+}
+static inline void mc_hh( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_width, int i_height )
+{   /* Horizontal 6-tap filtering of an i_width x i_height block; reads src[x-2 .. x+3] on every row. */
+    int x, y;
+
+    for( y = 0; y < i_height; y++ )
+    {
+        for( x = 0; x < i_width; x++ )
+        {
+            dst[x] = x264_mc_clip1( ( x264_tapfilter1( &src[x] ) + 16 ) >> 5 );    /* round, divide by 32 (the tap sum), then clip */
+        }
+        src += i_src_stride;
+        dst += i_dst_stride;
+    }
+}
+static inline void mc_hv( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_width, int i_height )
+{   /* Vertical 6-tap filtering of an i_width x i_height block; reads rows -2 .. +3 around every source row. */
+    int x, y;
+
+    for( y = 0; y < i_height; y++ )
+    {
+        for( x = 0; x < i_width; x++ )
+        {
+            dst[x] = x264_mc_clip1( ( x264_tapfilter( &src[x], i_src_stride ) + 16 ) >> 5 );    /* round, divide by 32 (the tap sum), then clip */
+        }
+        src += i_src_stride;
+        dst += i_dst_stride;
+    }
+}
+static inline void mc_hc( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_width, int i_height )
+{   /* Horizontal then vertical 6-tap filtering, done column by column with a sliding window of six row results. */
+    uint8_t *out;
+    uint8_t *pix;
+    int x, y;
+
+    for( x = 0; x < i_width; x++ )
+    {
+        int tap[6];    /* unscaled horizontal filter results for six consecutive rows */
+
+        pix = &src[x];
+        out = &dst[x];
+
+        tap[0] = x264_tapfilter1( &pix[-2*i_src_stride] );
+        tap[1] = x264_tapfilter1( &pix[-1*i_src_stride] );
+        tap[2] = x264_tapfilter1( &pix[ 0*i_src_stride] );
+        tap[3] = x264_tapfilter1( &pix[ 1*i_src_stride] );
+        tap[4] = x264_tapfilter1( &pix[ 2*i_src_stride] );
+
+        for( y = 0; y < i_height; y++ )
+        {
+            tap[5] = x264_tapfilter1( &pix[ 3*i_src_stride] );    /* only the newest row is filtered on each iteration */
+
+            *out = x264_mc_clip1( ( tap[0] - 5*tap[1] + 20 * tap[2] + 20 * tap[3] -5*tap[4] + tap[5] + 512 ) >> 10 );    /* two filter passes: round and divide by 32*32 */
+
+            /* Next line */
+            pix += i_src_stride;
+            out += i_dst_stride;
+            tap[0] = tap[1];
+            tap[1] = tap[2];
+            tap[2] = tap[3];
+            tap[3] = tap[4];
+            tap[4] = tap[5];
+        }
+    }
+}
+
+/* mc I+H */
+static void mc_xy10( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_width, int i_height )
+{   /* dst = average of src and the horizontally filtered block */
+    uint8_t tmp[16*16];    /* filtered block, stored with stride i_width: i_width * i_height must not exceed 256 */
+    mc_hh( src, i_src_stride, tmp, i_width, i_width, i_height );
+    pixel_avg( dst, i_dst_stride, src, i_src_stride, tmp, i_width, i_width, i_height );
+}
+static void mc_xy30( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_width, int i_height )
+{   /* dst = average of the block one pixel to the right (src+1) and the horizontally filtered block */
+    uint8_t tmp[16*16];    /* filtered block, stored with stride i_width: i_width * i_height must not exceed 256 */
+    mc_hh( src, i_src_stride, tmp, i_width, i_width, i_height );
+    pixel_avg( dst, i_dst_stride, src+1, i_src_stride, tmp, i_width, i_width, i_height );
+}
+/* mc I+V */
+static void mc_xy01( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_width, int i_height )
+{   /* dst = average of src and the vertically filtered block */
+    uint8_t tmp[16*16];    /* filtered block, stored with stride i_width: i_width * i_height must not exceed 256 */
+    mc_hv( src, i_src_stride, tmp, i_width, i_width, i_height );
+    pixel_avg( dst, i_dst_stride, src, i_src_stride, tmp, i_width, i_width, i_height );
+}
+static void mc_xy03( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_width, int i_height )
+{   /* dst = average of the block one row down (src+i_src_stride) and the vertically filtered block */
+    uint8_t tmp[16*16];    /* filtered block, stored with stride i_width: i_width * i_height must not exceed 256 */
+    mc_hv( src, i_src_stride, tmp, i_width, i_width, i_height );
+    pixel_avg( dst, i_dst_stride, src+i_src_stride, i_src_stride, tmp, i_width, i_width, i_height );
+}
+/* H+V */
+static void mc_xy11( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_width, int i_height )
+{   /* dst = average of the vertically and the horizontally filtered block, both taken at src */
+    uint8_t tmp1[16*16];    /* both scratch blocks use stride i_width: i_width * i_height must not exceed 256 */
+    uint8_t tmp2[16*16];
+
+    mc_hv( src, i_src_stride, tmp1, i_width, i_width, i_height );
+    mc_hh( src, i_src_stride, tmp2, i_width, i_width, i_height );
+    pixel_avg( dst, i_dst_stride, tmp1, i_width, tmp2, i_width, i_width, i_height );
+}
+static void mc_xy31( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_width, int i_height )
+{   /* dst = average of the vertical filter taken one pixel to the right (src+1) and the horizontal filter at src */
+    uint8_t tmp1[16*16];    /* both scratch blocks use stride i_width: i_width * i_height must not exceed 256 */
+    uint8_t tmp2[16*16];
+
+    mc_hv( src+1, i_src_stride, tmp1, i_width, i_width, i_height );
+    mc_hh( src,   i_src_stride, tmp2, i_width, i_width, i_height );
+    pixel_avg( dst, i_dst_stride, tmp1, i_width, tmp2, i_width, i_width, i_height );
+}
+static void mc_xy13( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_width, int i_height )
+{   /* dst = average of the vertical filter at src and the horizontal filter taken one row down (src+i_src_stride) */
+    uint8_t tmp1[16*16];    /* both scratch blocks use stride i_width: i_width * i_height must not exceed 256 */
+    uint8_t tmp2[16*16];
+
+    mc_hv( src,              i_src_stride, tmp1, i_width, i_width, i_height );
+    mc_hh( src+i_src_stride, i_src_stride, tmp2, i_width, i_width, i_height );
+    pixel_avg( dst, i_dst_stride, tmp1, i_width, tmp2, i_width, i_width, i_height );
+}
+static void mc_xy33( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_width, int i_height )
+{   /* dst = average of the vertical filter at src+1 and the horizontal filter at src+i_src_stride */
+    uint8_t tmp1[16*16];    /* both scratch blocks use stride i_width: i_width * i_height must not exceed 256 */
+    uint8_t tmp2[16*16];
+
+    mc_hv( src+1,            i_src_stride, tmp1, i_width, i_width, i_height );
+    mc_hh( src+i_src_stride, i_src_stride, tmp2, i_width, i_width, i_height );
+    pixel_avg( dst, i_dst_stride, tmp1, i_width, tmp2, i_width, i_width, i_height );
+}
+static void mc_xy21( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_width, int i_height )
+{
+    uint8_t centre[16*16];
+    uint8_t half_h[16*16];
+    /* dst = pixel_avg( mc_hc(src), mc_hh(src) ); scratch rows are packed at stride i_width */
+    mc_hc( src, i_src_stride, centre, i_width, i_width, i_height );
+    mc_hh( src, i_src_stride, half_h, i_width, i_width, i_height );
+    pixel_avg( dst, i_dst_stride, centre, i_width, half_h, i_width, i_width, i_height );
+}
+static void mc_xy12( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_width, int i_height )
+{
+    uint8_t centre[16*16];
+    uint8_t half_v[16*16];
+    /* dst = pixel_avg( mc_hc(src), mc_hv(src) ); scratch rows are packed at stride i_width */
+    mc_hc( src, i_src_stride, centre, i_width, i_width, i_height );
+    mc_hv( src, i_src_stride, half_v, i_width, i_width, i_height );
+    pixel_avg( dst, i_dst_stride, centre, i_width, half_v, i_width, i_width, i_height );
+}
+static void mc_xy32( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_width, int i_height )
+{
+    uint8_t centre[16*16];
+    uint8_t half_v[16*16];
+    /* dst = pixel_avg( mc_hc(src), mc_hv(src+1) ): the mc_hv input starts one pixel to the right */
+    mc_hc( src,   i_src_stride, centre, i_width, i_width, i_height );
+    mc_hv( src+1, i_src_stride, half_v, i_width, i_width, i_height );
+    pixel_avg( dst, i_dst_stride, centre, i_width, half_v, i_width, i_width, i_height );
+}
+static void mc_xy23( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_width, int i_height )
+{
+    uint8_t centre[16*16];
+    uint8_t half_h[16*16];
+    /* dst = pixel_avg( mc_hc(src), mc_hh(src + one row) ): the mc_hh input starts one row below */
+    mc_hc( src,              i_src_stride, centre, i_width, i_width, i_height );
+    mc_hh( src+i_src_stride, i_src_stride, half_h, i_width, i_width, i_height );
+    pixel_avg( dst, i_dst_stride, centre, i_width, half_h, i_width, i_width, i_height );
+}
+
+static void motion_compensation_luma( uint8_t *src, int i_src_stride,
+                                      uint8_t *dst, int i_dst_stride,
+                                      int mvx,int mvy,
+                                      int i_width, int i_height )
+{
+    static pf_mc_t qpel_fn[4][4] =  /* indexed [mvy & 3][mvx & 3] */
+    {
+        { mc_copy,  mc_xy10,    mc_hh,      mc_xy30 },
+        { mc_xy01,  mc_xy11,    mc_xy21,    mc_xy31 },
+        { mc_hv,    mc_xy12,    mc_hc,      mc_xy32 },
+        { mc_xy03,  mc_xy13,    mc_xy23,    mc_xy33 },
+    };
+    /* mv >> 2 selects the base pixel; the two low bits of each component pick the routine */
+    uint8_t *ref = src + ( (mvy >> 2) * i_src_stride + (mvx >> 2) );
+    qpel_fn[mvy&0x03][mvx&0x03]( ref, i_src_stride, dst, i_dst_stride, i_width, i_height );
+}
+
+/* full chroma mc: bilinear interpolation with 1/8 pixel precision */
+static void motion_compensation_chroma( uint8_t *src, int i_src_stride,
+                                        uint8_t *dst, int i_dst_stride,
+                                        int mvx, int mvy,
+                                        int i_width, int i_height )
+{
+    /* fractional part of the vector, in 1/8 pixel units */
+    const int fx = mvx & 0x07;
+    const int fy = mvy & 0x07;
+    /* weights of the 2x2 neighbourhood; the four always sum to 64 */
+    const int w00 = (8 - fx) * (8 - fy);
+    const int w01 = fx       * (8 - fy);
+    const int w10 = (8 - fx) * fy;
+    const int w11 = fx       * fy;
+    uint8_t *row0, *row1;
+    int col, line;
+
+    row0 = src + ( (mvy >> 3) * i_src_stride + (mvx >> 3) );
+    row1 = row0 + i_src_stride;
+
+    for( line = 0; line < i_height; line++ )
+    {
+        for( col = 0; col < i_width; col++ )
+        {
+            const int acc = w00*row0[col] + w01*row0[col+1] +
+                            w10*row1[col] + w11*row1[col+1];
+            dst[col] = ( acc + 32 ) >> 6;
+        }
+        dst  += i_dst_stride;
+        row0  = row1;
+        row1 += i_src_stride;
+    }
+}
+
+void x264_mc_init( int cpu, x264_mc_function_t pf[2] )
+{
+    pf[MC_LUMA]   = motion_compensation_luma;    /* C versions are installed first */
+    pf[MC_CHROMA] = motion_compensation_chroma;
+
+#ifdef HAVE_MMXEXT
+    if( ( cpu & X264_CPU_MMXEXT ) != 0 )
+    {
+        x264_mc_mmxext_init( pf );               /* only built when HAVE_MMXEXT is defined */
+    }
+#endif
+#ifdef HAVE_ALTIVEC
+    if( ( cpu & X264_CPU_ALTIVEC ) != 0 )
+    {
+        x264_mc_altivec_init( pf );              /* only built when HAVE_ALTIVEC is defined */
+    }
+#endif
+}
+
diff --git a/core/mc.h b/core/mc.h
new file mode 100644
index 00000000..8f91eab1
--- /dev/null
+++ b/core/mc.h
@@ -0,0 +1,45 @@
+/*****************************************************************************
+ * mc.h: h264 encoder library (Motion Compensation)
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: mc.h,v 1.1 2004/06/03 19:27:07 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#ifndef _MC_H
+#define _MC_H 1
+
+/* Do the MC: arguments are ( src, src stride, dst, dst stride, mvx, mvy, width, height ).
+ * XXX: only width = 4, 8 or 16 is valid:
+ *   width == 4  -> height == 4 or 8
+ *   width == 8  -> height == 4 or 8 or 16
+ *   width == 16 -> height == 8 or 16
+ * */
+
+typedef void (*x264_mc_function_t)(uint8_t *, int, uint8_t *, int,
+                          int mvx, int mvy,
+                          int i_width, int i_height );
+enum
+{
+    MC_LUMA   = 0,  /* index of the luma routine in the pf[] table */
+    MC_CHROMA = 1,  /* index of the chroma routine in the pf[] table */
+};
+/* Fill pf[MC_LUMA] and pf[MC_CHROMA]; cpu is a mask of X264_CPU_* flags. */
+void x264_mc_init( int cpu, x264_mc_function_t pf[2] );
+
+#endif
diff --git a/core/mdate.c b/core/mdate.c
new file mode 100644
index 00000000..5b4a2a91
--- /dev/null
+++ b/core/mdate.c
@@ -0,0 +1,48 @@
+/*****************************************************************************
+ * mdate.c: h264 encoder
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: mdate.c,v 1.1 2004/06/03 19:27:07 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#include <stdlib.h>
+#include <stdint.h>
+
+#if !(defined(_MSC_VER) || defined(__MINGW32__))
+#include <sys/time.h>
+#else
+#include <sys/types.h>
+#include <sys/timeb.h>
+#endif
+#include <time.h>
+
+int64_t x264_mdate( void )
+{
+#if defined(_MSC_VER) || defined(__MINGW32__)
+    struct _timeb now;
+    _ftime( &now );    /* seconds + milliseconds, scaled up to microseconds below */
+    return ( (int64_t)now.time * 1000 + (int64_t)now.millitm ) * 1000;
+#else
+    struct timeval now;
+    /* current time of day expressed in microseconds */
+    gettimeofday( &now, NULL );
+    return (int64_t)now.tv_sec * 1000000 + (int64_t)now.tv_usec;
+#endif
+}
+
diff --git a/core/pixel.c b/core/pixel.c
new file mode 100644
index 00000000..f06a099e
--- /dev/null
+++ b/core/pixel.c
@@ -0,0 +1,228 @@
+/*****************************************************************************
+ * pixel.c: h264 encoder
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: pixel.c,v 1.1 2004/06/03 19:27:07 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+
+#include "../x264.h"
+#include "pixel.h"
+
+#ifdef HAVE_MMXEXT
+#   include "i386/pixel.h"
+#endif
+#ifdef HAVE_ALTIVEC
+#   include "ppc/pixel.h"
+#endif
+
+
+/****************************************************************************
+ * pixel_sad_WxH: sum of absolute differences between two WxH pixel blocks
+ ****************************************************************************/
+#define PIXEL_SAD_C( name, lx, ly ) \
+static int name( uint8_t *pix1, int i_stride_pix1,  \
+                 uint8_t *pix2, int i_stride_pix2 ) \
+{                                                   \
+    int i_sad = 0;                                  \
+    int i_row, i_col;                               \
+    for( i_row = 0; i_row < (ly); i_row++ )         \
+    {                                               \
+        for( i_col = 0; i_col < (lx); i_col++ )     \
+        {                                           \
+            i_sad += abs( pix1[i_col] - pix2[i_col] ); \
+        }                                           \
+        pix1 += i_stride_pix1;                      \
+        pix2 += i_stride_pix2;                      \
+    }                                               \
+    return i_sad;                                   \
+}
+
+
+PIXEL_SAD_C( pixel_sad_16x16, 16, 16 )
+PIXEL_SAD_C( pixel_sad_16x8,  16,  8 )
+PIXEL_SAD_C( pixel_sad_8x16,   8, 16 )
+PIXEL_SAD_C( pixel_sad_8x8,    8,  8 )
+PIXEL_SAD_C( pixel_sad_8x4,    8,  4 )
+PIXEL_SAD_C( pixel_sad_4x8,    4,  8 )
+PIXEL_SAD_C( pixel_sad_4x4,    4,  4 )
+
+static void pixel_sub_4x4( int16_t diff[4][4], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
+{
+    /* diff = pix1 - pix2 over a 4x4 block, one row per iteration */
+    int i_row;
+    for( i_row = 0; i_row < 4; i_row++, pix1 += i_pix1, pix2 += i_pix2 )
+    {
+        int16_t *out = diff[i_row];
+        out[0] = pix1[0] - pix2[0];
+        out[1] = pix1[1] - pix2[1];
+        out[2] = pix1[2] - pix2[2];
+        out[3] = pix1[3] - pix2[3];
+    }
+}
+
+static int pixel_satd_wxh( uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2, int i_width, int i_height )
+{
+    /* Sum of absolute transformed differences, accumulated over 4x4 tiles and halved. */
+    int16_t resid[4][4];    /* pix1 - pix2 for the current tile */
+    int16_t rows[4][4];     /* tile after the horizontal pass */
+    int bx, by, i;
+    int i_sum = 0;
+
+    for( by = 0; by < i_height; by += 4 )
+    {
+        for( bx = 0; bx < i_width; bx += 4 )
+        {
+            pixel_sub_4x4( resid, &pix1[bx], i_pix1, &pix2[bx], i_pix2 );
+
+            /* horizontal pass: sum/difference butterfly on each row */
+            for( i = 0; i < 4; i++ )
+            {
+                const int a = resid[i][0] + resid[i][1];
+                const int b = resid[i][2] + resid[i][3];
+                const int c = resid[i][0] - resid[i][1];
+                const int e = resid[i][2] - resid[i][3];
+
+                rows[i][0] = a + b;
+                rows[i][1] = a - b;
+                rows[i][2] = c - e;
+                rows[i][3] = c + e;
+            }
+
+            /* vertical pass on each column, accumulating absolute values */
+            for( i = 0; i < 4; i++ )
+            {
+                const int a = rows[0][i] + rows[1][i];
+                const int b = rows[2][i] + rows[3][i];
+                const int c = rows[0][i] - rows[1][i];
+                const int e = rows[2][i] - rows[3][i];
+
+                i_sum += abs( a + b ) + abs( a - b ) + abs( c - e ) + abs( c + e );
+            }
+        }
+        /* step both planes down to the next band of four rows */
+        pix1 += 4 * i_pix1;
+        pix2 += 4 * i_pix2;
+    }
+
+    return i_sum / 2;
+}
+#define PIXEL_SATD_C( name, w, h )                  \
+static int name( uint8_t *pix1, int i_stride_pix1,  \
+                 uint8_t *pix2, int i_stride_pix2 ) \
+{   /* fixed-size front end to pixel_satd_wxh */    \
+    return pixel_satd_wxh( pix1, i_stride_pix1, pix2, i_stride_pix2, (w), (h) ); \
+}
+PIXEL_SATD_C( pixel_satd_16x16, 16, 16 )
+PIXEL_SATD_C( pixel_satd_16x8,  16,  8 )
+PIXEL_SATD_C( pixel_satd_8x16,   8, 16 )
+PIXEL_SATD_C( pixel_satd_8x8,    8,  8 )
+PIXEL_SATD_C( pixel_satd_8x4,    8,  4 )
+PIXEL_SATD_C( pixel_satd_4x8,    4,  8 )
+PIXEL_SATD_C( pixel_satd_4x4,    4,  4 )
+
+
+static inline void pixel_avg_wxh( uint8_t *dst, int i_dst, uint8_t *src, int i_src, int width, int height )
+{
+    /* in-place rounded average: dst = ( dst + src + 1 ) >> 1 for every pixel */
+    int i_col, i_row;
+    for( i_row = 0; i_row < height; i_row++, dst += i_dst, src += i_src )
+    {
+        for( i_col = 0; i_col < width; i_col++ )
+        {
+            const int i_sum = dst[i_col] + src[i_col] + 1;
+            dst[i_col] = i_sum >> 1;
+        }
+    }
+}
+
+
+#define PIXEL_AVG_C( name, w, h )                    \
+static void name( uint8_t *pix1, int i_stride_pix1, \
+                  uint8_t *pix2, int i_stride_pix2 ) \
+{   /* pix1 is both an input and the destination */ \
+    pixel_avg_wxh( pix1, i_stride_pix1, pix2, i_stride_pix2, (w), (h) ); \
+}
+PIXEL_AVG_C( pixel_avg_16x16, 16, 16 )
+PIXEL_AVG_C( pixel_avg_16x8,  16,  8 )
+PIXEL_AVG_C( pixel_avg_8x16,   8, 16 )
+PIXEL_AVG_C( pixel_avg_8x8,    8,  8 )
+PIXEL_AVG_C( pixel_avg_8x4,    8,  4 )
+PIXEL_AVG_C( pixel_avg_4x8,    4,  8 )
+PIXEL_AVG_C( pixel_avg_4x4,    4,  4 )
+
+/****************************************************************************
+ * x264_pixel_init: install the C routines, then the SIMD ones selected by cpu
+ ****************************************************************************/
+void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
+{
+    /* portable C routines, one (sad, satd, avg) triple per block size */
+    pixf->sad [PIXEL_16x16] = pixel_sad_16x16;
+    pixf->satd[PIXEL_16x16] = pixel_satd_16x16;
+    pixf->avg [PIXEL_16x16] = pixel_avg_16x16;
+    pixf->sad [PIXEL_16x8]  = pixel_sad_16x8;
+    pixf->satd[PIXEL_16x8]  = pixel_satd_16x8;
+    pixf->avg [PIXEL_16x8]  = pixel_avg_16x8;
+    pixf->sad [PIXEL_8x16]  = pixel_sad_8x16;
+    pixf->satd[PIXEL_8x16]  = pixel_satd_8x16;
+    pixf->avg [PIXEL_8x16]  = pixel_avg_8x16;
+    pixf->sad [PIXEL_8x8]   = pixel_sad_8x8;
+    pixf->satd[PIXEL_8x8]   = pixel_satd_8x8;
+    pixf->avg [PIXEL_8x8]   = pixel_avg_8x8;
+    pixf->sad [PIXEL_8x4]   = pixel_sad_8x4;
+    pixf->satd[PIXEL_8x4]   = pixel_satd_8x4;
+    pixf->avg [PIXEL_8x4]   = pixel_avg_8x4;
+    pixf->sad [PIXEL_4x8]   = pixel_sad_4x8;
+    pixf->satd[PIXEL_4x8]   = pixel_satd_4x8;
+    pixf->avg [PIXEL_4x8]   = pixel_avg_4x8;
+    pixf->sad [PIXEL_4x4]   = pixel_sad_4x4;
+    pixf->satd[PIXEL_4x4]   = pixel_satd_4x4;
+    pixf->avg [PIXEL_4x4]   = pixel_avg_4x4;
+    /* the MMXEXT branch below replaces sad and satd only; avg keeps the C routine */
+#ifdef HAVE_MMXEXT
+    if( ( cpu & X264_CPU_MMXEXT ) != 0 )
+    {
+        pixf->sad [PIXEL_16x16] = x264_pixel_sad_16x16_mmxext;
+        pixf->satd[PIXEL_16x16] = x264_pixel_satd_16x16_mmxext;
+        pixf->sad [PIXEL_16x8]  = x264_pixel_sad_16x8_mmxext;
+        pixf->satd[PIXEL_16x8]  = x264_pixel_satd_16x8_mmxext;
+        pixf->sad [PIXEL_8x16]  = x264_pixel_sad_8x16_mmxext;
+        pixf->satd[PIXEL_8x16]  = x264_pixel_satd_8x16_mmxext;
+        pixf->sad [PIXEL_8x8]   = x264_pixel_sad_8x8_mmxext;
+        pixf->satd[PIXEL_8x8]   = x264_pixel_satd_8x8_mmxext;
+        /* sub-8x8 sizes */
+        pixf->sad [PIXEL_8x4]   = x264_pixel_sad_8x4_mmxext;
+        pixf->satd[PIXEL_8x4]   = x264_pixel_satd_8x4_mmxext;
+        pixf->sad [PIXEL_4x8]   = x264_pixel_sad_4x8_mmxext;
+        pixf->satd[PIXEL_4x8]   = x264_pixel_satd_4x8_mmxext;
+        pixf->sad [PIXEL_4x4]   = x264_pixel_sad_4x4_mmxext;
+        pixf->satd[PIXEL_4x4]   = x264_pixel_satd_4x4_mmxext;
+    }
+#endif
+#ifdef HAVE_ALTIVEC
+    if( ( cpu & X264_CPU_ALTIVEC ) != 0 )
+    {
+        x264_pixel_altivec_init( pixf );
+    }
+#endif
+}
+
diff --git a/core/pixel.h b/core/pixel.h
new file mode 100644
index 00000000..a8055b5c
--- /dev/null
+++ b/core/pixel.h
@@ -0,0 +1,62 @@
+/*****************************************************************************
+ * pixel.h: h264 encoder library
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: pixel.h,v 1.1 2004/06/03 19:27:07 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#ifndef _PIXEL_H
+#define _PIXEL_H 1
+
+typedef int  (*x264_pixel_sad_t) ( uint8_t *, int, uint8_t *, int );   /* ( pix1, stride1, pix2, stride2 ) -> SAD  */
+typedef int  (*x264_pixel_satd_t)( uint8_t *, int, uint8_t *, int );   /* ( pix1, stride1, pix2, stride2 ) -> SATD */
+typedef void (*x264_pixel_avg_t) ( uint8_t *, int, uint8_t *, int );   /* first block is averaged with the second, in place */
+/* Block sizes (WxH); the values index the function tables and x264_pixel_size. */
+enum
+{
+    PIXEL_16x16 = 0,
+    PIXEL_16x8  = 1,
+    PIXEL_8x16  = 2,
+    PIXEL_8x8   = 3,
+    PIXEL_8x4   = 4,
+    PIXEL_4x8   = 5,
+    PIXEL_4x4   = 6,
+};
+/* Width and height of each block size, listed in the same order as the enum above. */
+static const struct {
+    int w;
+    int h;
+} x264_pixel_size[7] = {
+    { 16, 16 },
+    { 16,  8 }, {  8, 16 },
+    {  8,  8 },
+    {  8,  4 }, {  4,  8 },
+    {  4,  4 }
+};
+/* Function tables, one entry per PIXEL_* block size. */
+typedef struct
+{
+    x264_pixel_sad_t  sad[7];
+    x264_pixel_satd_t satd[7];
+    x264_pixel_avg_t  avg[7];
+} x264_pixel_function_t;
+/* Fill *pixf; cpu is a mask of X264_CPU_* flags selecting optional SIMD routines. */
+void x264_pixel_init( int cpu, x264_pixel_function_t *pixf );
+
+#endif
diff --git a/core/ppc/mc.c b/core/ppc/mc.c
new file mode 100644
index 00000000..f4b54d77
--- /dev/null
+++ b/core/ppc/mc.c
@@ -0,0 +1,681 @@
+/*****************************************************************************
+ * mc.c: h264 encoder library (Motion Compensation)
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: mc.c,v 1.1 2004/06/03 19:27:07 fenrir Exp $
+ *
+ * Authors: Eric Petit <titer@m0k.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdint.h>
+
+#include "x264.h"
+#include "../mc.h"
+#include "../clip1.h"
+#include "mc.h"
+#include "ppccommon.h"
+
+typedef void (*pf_mc_t)( uint8_t *src, int i_src,                  /* source block and its stride */
+                         uint8_t *dst, int i_dst, int i_height );  /* destination, its stride, number of rows */
+
+static inline int x264_tapfilter( uint8_t *pix, int i_pix_next )
+{
+    /* unscaled 6-tap kernel (1,-5,20,20,-5,1) over samples spaced i_pix_next apart */
+    const int i_far = pix[-2*i_pix_next] + pix[ 3*i_pix_next], i_near = pix[-1*i_pix_next] + pix[ 2*i_pix_next];
+    return i_far - 5*i_near + 20*( pix[0] + pix[1*i_pix_next] );
+}
+static inline int x264_tapfilter1( uint8_t *pix )
+{
+    /* unscaled 6-tap kernel (1,-5,20,20,-5,1) applied to pix[-2] .. pix[3] */
+    return ( pix[-2] + pix[ 3] ) - 5*( pix[-1] + pix[ 2] ) + 20*( pix[0] + pix[1] );
+}
+
+/* pixel_avg */
+static inline void pixel_avg_w4( uint8_t *dst,  int i_dst,
+                                 uint8_t *src1, int i_src1,
+                                 uint8_t *src2, int i_src2,
+                                 int i_height )
+{
+    int i_row;  /* each row: dst = rounded mean of src1 and src2, 4 pixels wide */
+    for( i_row = 0; i_row < i_height; i_row++ )
+    {
+        dst[0] = ( src1[0] + src2[0] + 1 ) >> 1;
+        dst[1] = ( src1[1] + src2[1] + 1 ) >> 1;
+        dst[2] = ( src1[2] + src2[2] + 1 ) >> 1;
+        dst[3] = ( src1[3] + src2[3] + 1 ) >> 1;
+        dst  += i_dst;
+        src1 += i_src1;
+        src2 += i_src2;
+    }
+}
+static inline void pixel_avg_w8( uint8_t *dst,  int i_dst,
+                                 uint8_t *src1, int i_src1,
+                                 uint8_t *src2, int i_src2,
+                                 int i_height )
+{
+    /* TODO - optimize: for now the block is handled as two 4-wide halves */
+    pixel_avg_w4( dst,     i_dst, src1,     i_src1, src2,     i_src2,
+                  i_height );
+    pixel_avg_w4( dst + 4, i_dst, src1 + 4, i_src1, src2 + 4, i_src2,
+                  i_height );
+}
+static inline void pixel_avg_w16( uint8_t *dst,  int i_dst,
+                                  uint8_t *src1, int i_src1,
+                                  uint8_t *src2, int i_src2,
+                                  int i_height )
+{
+    int y;  /* 16-wide counterpart of pixel_avg_w4: one vector operation per row */
+    vector_u8_t src1v, src2v;
+    for( y = 0; y < i_height; y++ )
+    {
+        LOAD_16( src1, src1v );    /* presumably fetches 16 bytes into the vector (macro from ppccommon.h) - confirm */
+        LOAD_16( src2, src2v );
+        src1v = vec_avg( src1v, src2v );    /* AltiVec rounded per-element average */
+        STORE_16( src1v, dst );
+        /* step all three pointers to the next row */
+        dst  += i_dst;
+        src1 += i_src1;
+        src2 += i_src2;
+    }
+}
+
+/* mc_copy: plain c, row-by-row copy of a bytes per row */
+#define MC_COPY( name, a )                                \
+static void name( uint8_t *src, int i_src,                \
+                  uint8_t *dst, int i_dst, int i_height ) \
+{                                                         \
+    int i_row;                                            \
+    for( i_row = 0; i_row < i_height; i_row++ )           \
+    {                                                     \
+        memcpy( dst, src, (a) );                          \
+        dst += i_dst;                                     \
+        src += i_src;                                     \
+    }                                                     \
+}
+MC_COPY( mc_copy_w4,  4  )
+MC_COPY( mc_copy_w8,  8  )
+MC_COPY( mc_copy_w16, 16 )
+
+/* TAP_FILTER:
+   a is source (vector_s16_t [6])
+   b is a temporary vector_s16_t
+   c is the result
+
+   c   = a[0] + a[5] - 5 * ( a[1] + a[4] ) + 20 * ( a[2] + a[3] );
+   c  += 16;
+   c >>= 5;   (arithmetic shift, the sum can be negative when the -5 taps dominate)
+   c  += 80;  (callers use the result directly as an x264_mc_clip1_table index) */
+#define TAP_FILTER( a, b, c )                       \
+    c = vec_add( a[0], a[5] );                      \
+    b = vec_add( a[1], a[4] );                      \
+    c = vec_sub( c, b );                            \
+    b = vec_sl( b, vec_splat_u16( 2 ) );            \
+    c = vec_sub( c, b );                            \
+    b = vec_add( a[2], a[3] );                      \
+    b = vec_sl( b, vec_splat_u16( 2 ) );            \
+    c = vec_add( c, b );                            \
+    b = vec_sl( b, vec_splat_u16( 2 ) );            \
+    c = vec_add( c, b );                            \
+    c = vec_add( c, vec_splat_s16( 8 ) );           \
+    c = vec_add( c, vec_splat_s16( 8 ) );           \
+    c = vec_sra( c, vec_splat_u16( 5 ) );           \
+    c = vec_add( c, vec_sl( vec_splat_s16( 5 ),     \
+                            vec_splat_u16( 4 ) ) );
+
+/* mc_hh */
+static inline void mc_hh_w4( uint8_t *src, int i_src,
+                             uint8_t *dst, int i_dst, int i_height )
+{
+    /* 4-wide horizontal filter, plain C */
+    int i_col, i_row;
+    for( i_row = 0; i_row < i_height; i_row++, src += i_src, dst += i_dst )
+    {
+        for( i_col = 0; i_col < 4; i_col++ )
+        {
+            /* add the rounding term, drop 5 bits, then x264_mc_clip1() */
+            const int i_tap = x264_tapfilter1( &src[i_col] );
+            dst[i_col] = x264_mc_clip1( ( i_tap + 16 ) >> 5 );
+        }
+    }
+}
+static inline void mc_hh_w8( uint8_t *src, int i_src,
+                             uint8_t *dst, int i_dst, int i_height )
+{
+    int x, y;
+    DECLARE_ALIGNED( int16_t, tmp[8], 16 );
+    /* 8-wide horizontal filter: TAP_FILTER on vectors, then one table lookup per pixel. */
+    LOAD_ZERO;
+    vector_u8_t    loadv;
+    vector_s16_t   srcv[6];
+    vector_u8_t  * _srcv = (vector_u8_t*) srcv;    /* byte-vector view of the same six vectors */
+    vector_s16_t   dstv;
+    vector_s16_t   tmpv;
+
+    for( y = 0; y < i_height; y++ )
+    {
+        LOAD_16( &src[-2], loadv );    /* the filter needs 2 pixels to the left of src */
+        /* srcv[x] = the row starting x bytes further right; vec_lvsl on the fake address x builds the byte-shift pattern */
+        for( x = 0; x < 6; x++ )
+        {
+            _srcv[x] = vec_perm( loadv, zero_u8,
+                                 vec_lvsl( 0, (int*) x ) );
+            CONVERT_U8_TO_S16( srcv[x] );    /* presumably widens the first 8 bytes to 16-bit lanes - confirm in ppccommon.h */
+        }
+        /* filter 8 pixels at once, then spill the 16-bit results to tmp */
+        TAP_FILTER( srcv, tmpv, dstv );
+        vec_st( dstv, 0, tmp );
+        /* tmp[x] is used directly as an index into x264_mc_clip1_table */
+        for( x = 0; x < 8; x++ )
+        {
+            dst[x] = x264_mc_clip1_table[tmp[x]];
+        }
+        /* next row */
+        src += i_src;
+        dst += i_dst;
+    }
+}
+static inline void mc_hh_w16( uint8_t *src, int i_src,
+                              uint8_t *dst, int i_dst, int i_height )
+{
+    mc_hh_w8( src,     i_src, dst,     i_dst, i_height );    /* left  8 columns */
+    mc_hh_w8( src + 8, i_src, dst + 8, i_dst, i_height );    /* right 8 columns */
+}
+
+/* mc_hv */
+static inline void mc_hv_w4( uint8_t *src, int i_src,
+                             uint8_t *dst, int i_dst, int i_height )
+{
+    /* 4-wide vertical filter, plain C: taps are i_src bytes apart */
+    int i_col, i_row;
+    for( i_row = 0; i_row < i_height; i_row++, src += i_src, dst += i_dst )
+    {
+        for( i_col = 0; i_col < 4; i_col++ )
+        {
+            /* add the rounding term, drop 5 bits, then x264_mc_clip1() */
+            const int i_tap = x264_tapfilter( &src[i_col], i_src );
+            dst[i_col] = x264_mc_clip1( ( i_tap + 16 ) >> 5 );
+        }
+    }
+}
+static inline void mc_hv_w8( uint8_t *src, int i_src,
+                             uint8_t *dst, int i_dst, int i_height )
+{
+    int x, y;
+    DECLARE_ALIGNED( int16_t, tmp[8], 16 );
+    /* 8-wide vertical filter: srcv[] holds the six source rows feeding the current output row. */
+    LOAD_ZERO;
+    vector_s16_t   srcv[6];
+    vector_u8_t  * _srcv = (vector_u8_t*) srcv;    /* byte-vector view of the same six vectors */
+    vector_s16_t   dstv;
+    vector_s16_t   tmpv;
+
+    for( y = 0; y < i_height; y++ )
+    {
+        if( y )    /* later rows: reuse five rows, load only the new bottom one */
+        {
+            for( x = 0; x < 5; x++ )
+            {
+                srcv[x] = srcv[x+1];
+            }
+            LOAD_8( &src[3*i_src], _srcv[5] );
+            CONVERT_U8_TO_S16( srcv[5] );    /* presumably widens bytes to 16-bit lanes - confirm in ppccommon.h */
+        }
+        else       /* first row: load rows -2 .. +3 relative to src */
+        {
+            for( x = 0; x < 6; x++ )
+            {
+                LOAD_8( &src[(x-2)*i_src], _srcv[x] );
+                CONVERT_U8_TO_S16( srcv[x] );
+            }
+        }
+        /* filter 8 pixels at once, then spill the 16-bit results to tmp */
+        TAP_FILTER( srcv, tmpv, dstv );
+        vec_st( dstv, 0, tmp );
+        /* tmp[x] is used directly as an index into x264_mc_clip1_table */
+        for( x = 0; x < 8; x++ )
+        {
+            dst[x] = x264_mc_clip1_table[tmp[x]];
+        }
+        src += i_src;
+        dst += i_dst;
+    }
+}
+static inline void mc_hv_w16( uint8_t *src, int i_src,
+                              uint8_t *dst, int i_dst, int i_height )
+{
+    mc_hv_w8( src,     i_src, dst,     i_dst, i_height );    /* left  8 columns */
+    mc_hv_w8( src + 8, i_src, dst + 8, i_dst, i_height );    /* right 8 columns */
+}
+
+/* mc_hc */
+static inline void mc_hc_w4( uint8_t *src, int i_src,
+                             uint8_t *dst, int i_dst, int i_height )
+{
+    /* 4-wide centre filter: horizontal 6-tap sums are filtered again
+     * vertically, then rounded with 512 and scaled down by 10 bits. */
+    int i_col, i_row, k;
+
+    for( i_col = 0; i_col < 4; i_col++ )
+    {
+        uint8_t *pix = &src[i_col];
+        uint8_t *out = &dst[i_col];
+        int h[6];    /* sliding window of horizontally filtered rows */
+
+        /* prime the window with rows -2..+2 */
+        for( k = 0; k < 5; k++ )
+        {
+            h[k] = x264_tapfilter1( &pix[(k-2)*i_src] );
+        }
+
+        /* one output pixel per row; only the newest window entry is computed */
+        for( i_row = 0; i_row < i_height; i_row++ )
+        {
+            int i_sum;
+
+            h[5]  = x264_tapfilter1( &pix[ 3*i_src] );
+            i_sum = h[0] - 5*h[1] + 20 * h[2] +
+                    20 * h[3] -5*h[4] + h[5];
+            *out  = x264_mc_clip1( ( i_sum + 512 ) >> 10 );
+
+            /* Next line: advance the pointers and slide the window */
+            pix += i_src;
+            out += i_dst;
+            for( k = 0; k < 5; k++ )
+            {
+                h[k] = h[k+1];
+            }
+        }
+    }
+}
+static inline void mc_hc_w8( uint8_t *src, int i_src,
+                             uint8_t *dst, int i_dst, int i_height )
+{
+    /* TODO: optimize - for now the block is handled as two 4-wide halves */
+    mc_hc_w4( src,     i_src, dst,     i_dst, i_height );
+    mc_hc_w4( src + 4, i_src, dst + 4, i_dst, i_height );
+}
+static inline void mc_hc_w16( uint8_t *src, int i_src,
+                              uint8_t *dst, int i_dst, int i_height )
+{
+    mc_hc_w8( src,     i_src, dst,     i_dst, i_height );    /* left  8 columns */
+    mc_hc_w8( src + 8, i_src, dst + 8, i_dst, i_height );    /* right 8 columns */
+}
+
+/* mc I+H: average of the source block and its mc_hh (horizontal) filtered version */
+static void mc_xy10_w4( uint8_t *src, int i_src,
+                        uint8_t *dst, int i_dst, int i_height )
+{
+    uint8_t half_h[16*4];    /* packed scratch, stride 4 */
+    mc_hh_w4( src, i_src, half_h, 4, i_height );
+    pixel_avg_w4( dst, i_dst, src, i_src, half_h, 4, i_height );
+}
+static void mc_xy10_w8( uint8_t *src, int i_src,
+                        uint8_t *dst, int i_dst, int i_height )
+{
+    uint8_t half_h[16*8];    /* packed scratch, stride 8 */
+    mc_hh_w8( src, i_src, half_h, 8, i_height );
+    pixel_avg_w8( dst, i_dst, src, i_src, half_h, 8, i_height );
+}
+static void mc_xy10_w16( uint8_t *src, int i_src,
+                         uint8_t *dst, int i_dst, int i_height )
+{
+    uint8_t half_h[16*16];   /* packed scratch, stride 16 */
+    mc_hh_w16( src, i_src, half_h, 16, i_height );
+    pixel_avg_w16( dst, i_dst, src, i_src, half_h, 16, i_height );
+}
+
+static void mc_xy30_w4( uint8_t *src, int i_src,
+                        uint8_t *dst, int i_dst, int i_height )
+{
+    uint8_t half_h[16*4];    /* mc_hh output, averaged with the block one pixel to the right */
+    mc_hh_w4( src, i_src, half_h, 4, i_height );
+    pixel_avg_w4( dst, i_dst, src + 1, i_src, half_h, 4, i_height );
+}
+static void mc_xy30_w8( uint8_t *src, int i_src,
+                        uint8_t *dst, int i_dst, int i_height )
+{
+    uint8_t half_h[16*8];    /* mc_hh output, averaged with the block one pixel to the right */
+    mc_hh_w8( src, i_src, half_h, 8, i_height );
+    pixel_avg_w8( dst, i_dst, src + 1, i_src, half_h, 8, i_height );
+}
+static void mc_xy30_w16( uint8_t *src, int i_src,
+                         uint8_t *dst, int i_dst, int i_height )
+{
+    uint8_t half_h[16*16];   /* mc_hh output, averaged with the block one pixel to the right */
+    mc_hh_w16( src, i_src, half_h, 16, i_height );
+    pixel_avg_w16( dst, i_dst, src + 1, i_src, half_h, 16, i_height );
+}
+
+/* mc I+V: average of the source block and its mc_hv (vertical) filtered version */
+static void mc_xy01_w4( uint8_t *src, int i_src,
+                        uint8_t *dst, int i_dst, int i_height )
+{
+    uint8_t half_v[16*4];    /* packed scratch, stride 4 */
+    mc_hv_w4( src, i_src, half_v, 4, i_height );
+    pixel_avg_w4( dst, i_dst, src, i_src, half_v, 4, i_height );
+}
+static void mc_xy01_w8( uint8_t *src, int i_src,
+                        uint8_t *dst, int i_dst, int i_height )
+{
+    uint8_t half_v[16*8];    /* packed scratch, stride 8 */
+    mc_hv_w8( src, i_src, half_v, 8, i_height );
+    pixel_avg_w8( dst, i_dst, src, i_src, half_v, 8, i_height );
+}
+static void mc_xy01_w16( uint8_t *src, int i_src,
+                         uint8_t *dst, int i_dst, int i_height )
+{
+    uint8_t half_v[16*16];   /* packed scratch, stride 16 */
+    mc_hv_w16( src, i_src, half_v, 16, i_height );
+    pixel_avg_w16( dst, i_dst, src, i_src, half_v, 16, i_height );
+}
+
+static void mc_xy03_w4( uint8_t *src, int i_src,
+                        uint8_t *dst, int i_dst, int i_height )
+{
+    uint8_t half_v[16*4];    /* mc_hv output, averaged with the block one row below */
+    mc_hv_w4( src, i_src, half_v, 4, i_height );
+    pixel_avg_w4( dst, i_dst, src + i_src, i_src, half_v, 4, i_height );
+}
+static void mc_xy03_w8( uint8_t *src, int i_src,
+                        uint8_t *dst, int i_dst, int i_height )
+{
+    uint8_t half_v[16*8];    /* mc_hv output, averaged with the block one row below */
+    mc_hv_w8( src, i_src, half_v, 8, i_height );
+    pixel_avg_w8( dst, i_dst, src + i_src, i_src, half_v, 8, i_height );
+}
+static void mc_xy03_w16( uint8_t *src, int i_src,
+                         uint8_t *dst, int i_dst, int i_height )
+{
+    uint8_t half_v[16*16];   /* mc_hv output, averaged with the block one row below */
+    mc_hv_w16( src, i_src, half_v, 16, i_height );
+    pixel_avg_w16( dst, i_dst, src + i_src, i_src, half_v, 16, i_height );
+}
+
+/* H+V */
+static void mc_xy11_w4( uint8_t *src, int i_src,
+                        uint8_t *dst, int i_dst, int i_height )
+{
+    uint8_t tmp1[16*4];
+    uint8_t tmp2[16*4];
+    mc_hv_w4( src, i_src, tmp1, 4, i_height );
+    mc_hh_w4( src, i_src, tmp2, 4, i_height );
+    pixel_avg_w4( dst, i_dst, tmp1, 4, tmp2, 4, i_height );
+}
+static void mc_xy11_w8( uint8_t *src, int i_src,
+                        uint8_t *dst, int i_dst, int i_height )
+{
+    uint8_t hv_out[16*8];   /* phase dqx=1, dqy=1: pixel_avg of */
+    uint8_t hh_out[16*8];   /* the mc_hv and mc_hh outputs      */
+    mc_hv_w8( src, i_src, hv_out, 8, i_height );
+    mc_hh_w8( src, i_src, hh_out, 8, i_height );
+    pixel_avg_w8( dst, i_dst, hv_out, 8, hh_out, 8, i_height );
+}
+static void mc_xy11_w16( uint8_t *src, int i_src,
+                         uint8_t *dst, int i_dst, int i_height )
+{
+    uint8_t hv_out[16*16];   /* phase dqx=1, dqy=1: pixel_avg of */
+    uint8_t hh_out[16*16];   /* the mc_hv and mc_hh outputs      */
+    mc_hv_w16( src, i_src, hv_out, 16, i_height );
+    mc_hh_w16( src, i_src, hh_out, 16, i_height );
+    pixel_avg_w16( dst, i_dst, hv_out, 16, hh_out, 16, i_height );
+}
+
+static void mc_xy31_w4( uint8_t *src, int i_src,
+                        uint8_t *dst, int i_dst, int i_height )
+{
+    uint8_t hv_out[16*4];   /* phase dqx=3, dqy=1: mc_hv one column right */
+    uint8_t hh_out[16*4];   /* paired with mc_hh at src                   */
+    mc_hv_w4( src+1, i_src, hv_out, 4, i_height );
+    mc_hh_w4( src,   i_src, hh_out, 4, i_height );
+    pixel_avg_w4( dst, i_dst, hv_out, 4, hh_out, 4, i_height );
+}
+static void mc_xy31_w8( uint8_t *src, int i_src,
+                        uint8_t *dst, int i_dst, int i_height )
+{
+    uint8_t hv_out[16*8];   /* phase dqx=3, dqy=1: mc_hv one column right */
+    uint8_t hh_out[16*8];   /* paired with mc_hh at src                   */
+    mc_hv_w8( src+1, i_src, hv_out, 8, i_height );
+    mc_hh_w8( src,   i_src, hh_out, 8, i_height );
+    pixel_avg_w8( dst, i_dst, hv_out, 8, hh_out, 8, i_height );
+}
+static void mc_xy31_w16( uint8_t *src, int i_src,
+                         uint8_t *dst, int i_dst, int i_height )
+{
+    uint8_t hv_out[16*16];   /* phase dqx=3, dqy=1: mc_hv one column right */
+    uint8_t hh_out[16*16];   /* paired with mc_hh at src                   */
+    mc_hv_w16( src+1, i_src, hv_out, 16, i_height );
+    mc_hh_w16( src,   i_src, hh_out, 16, i_height );
+    pixel_avg_w16( dst, i_dst, hv_out, 16, hh_out, 16, i_height );
+}
+
+static void mc_xy13_w4( uint8_t *src, int i_src,
+                        uint8_t *dst, int i_dst, int i_height )
+{
+    uint8_t hv_out[16*4];   /* phase dqx=1, dqy=3: mc_hv at src */
+    uint8_t hh_out[16*4];   /* paired with mc_hh one row down   */
+    mc_hv_w4( src,       i_src, hv_out, 4, i_height );
+    mc_hh_w4( src+i_src, i_src, hh_out, 4, i_height );
+    pixel_avg_w4( dst, i_dst, hv_out, 4, hh_out, 4, i_height );
+}
+static void mc_xy13_w8( uint8_t *src, int i_src,
+                        uint8_t *dst, int i_dst, int i_height )
+{
+    uint8_t hv_out[16*8];   /* phase dqx=1, dqy=3: mc_hv at src */
+    uint8_t hh_out[16*8];   /* paired with mc_hh one row down   */
+    mc_hv_w8( src,       i_src, hv_out, 8, i_height );
+    mc_hh_w8( src+i_src, i_src, hh_out, 8, i_height );
+    pixel_avg_w8( dst, i_dst, hv_out, 8, hh_out, 8, i_height );
+}
+static void mc_xy13_w16( uint8_t *src, int i_src,
+                         uint8_t *dst, int i_dst, int i_height )
+{
+    uint8_t hv_out[16*16];   /* phase dqx=1, dqy=3: mc_hv at src */
+    uint8_t hh_out[16*16];   /* paired with mc_hh one row down   */
+    mc_hv_w16( src,       i_src, hv_out, 16, i_height );
+    mc_hh_w16( src+i_src, i_src, hh_out, 16, i_height );
+    pixel_avg_w16( dst, i_dst, hv_out, 16, hh_out, 16, i_height );
+}
+
+static void mc_xy33_w4( uint8_t *src, int i_src,
+                        uint8_t *dst, int i_dst, int i_height )
+{
+    uint8_t hv_out[16*4];   /* phase dqx=3, dqy=3: mc_hv one column right */
+    uint8_t hh_out[16*4];   /* paired with mc_hh one row down             */
+    mc_hv_w4( src+1,     i_src, hv_out, 4, i_height );
+    mc_hh_w4( src+i_src, i_src, hh_out, 4, i_height );
+    pixel_avg_w4( dst, i_dst, hv_out, 4, hh_out, 4, i_height );
+}
+static void mc_xy33_w8( uint8_t *src, int i_src,
+                        uint8_t *dst, int i_dst, int i_height )
+{
+    uint8_t hv_out[16*8];   /* phase dqx=3, dqy=3: mc_hv one column right */
+    uint8_t hh_out[16*8];   /* paired with mc_hh one row down             */
+    mc_hv_w8( src+1,     i_src, hv_out, 8, i_height );
+    mc_hh_w8( src+i_src, i_src, hh_out, 8, i_height );
+    pixel_avg_w8( dst, i_dst, hv_out, 8, hh_out, 8, i_height );
+}
+static void mc_xy33_w16( uint8_t *src, int i_src,
+                         uint8_t *dst, int i_dst, int i_height )
+{
+    uint8_t hv_out[16*16];   /* phase dqx=3, dqy=3: mc_hv one column right */
+    uint8_t hh_out[16*16];   /* paired with mc_hh one row down             */
+    mc_hv_w16( src+1,     i_src, hv_out, 16, i_height );
+    mc_hh_w16( src+i_src, i_src, hh_out, 16, i_height );
+    pixel_avg_w16( dst, i_dst, hv_out, 16, hh_out, 16, i_height );
+}
+
+static void mc_xy21_w4( uint8_t *src, int i_src,
+                        uint8_t *dst, int i_dst, int i_height )
+{
+    uint8_t hc_out[16*4];   /* phase dqx=2, dqy=1: mc_hc at src */
+    uint8_t hh_out[16*4];   /* paired with mc_hh at src         */
+    mc_hc_w4( src, i_src, hc_out, 4, i_height );
+    mc_hh_w4( src, i_src, hh_out, 4, i_height );
+    pixel_avg_w4( dst, i_dst, hc_out, 4, hh_out, 4, i_height );
+}
+static void mc_xy21_w8( uint8_t *src, int i_src,
+                        uint8_t *dst, int i_dst, int i_height )
+{
+    uint8_t hc_out[16*8];   /* phase dqx=2, dqy=1: mc_hc at src */
+    uint8_t hh_out[16*8];   /* paired with mc_hh at src         */
+    mc_hc_w8( src, i_src, hc_out, 8, i_height );
+    mc_hh_w8( src, i_src, hh_out, 8, i_height );
+    pixel_avg_w8( dst, i_dst, hc_out, 8, hh_out, 8, i_height );
+}
+static void mc_xy21_w16( uint8_t *src, int i_src,
+                         uint8_t *dst, int i_dst, int i_height )
+{
+    uint8_t hc_out[16*16];   /* phase dqx=2, dqy=1: mc_hc at src */
+    uint8_t hh_out[16*16];   /* paired with mc_hh at src         */
+    mc_hc_w16( src, i_src, hc_out, 16, i_height );
+    mc_hh_w16( src, i_src, hh_out, 16, i_height );
+    pixel_avg_w16( dst, i_dst, hc_out, 16, hh_out, 16, i_height );
+}
+
+static void mc_xy12_w4( uint8_t *src, int i_src,
+                        uint8_t *dst, int i_dst, int i_height )
+{
+    uint8_t hc_out[16*4];   /* phase dqx=1, dqy=2: mc_hc at src */
+    uint8_t hv_out[16*4];   /* paired with mc_hv at src         */
+    mc_hc_w4( src, i_src, hc_out, 4, i_height );
+    mc_hv_w4( src, i_src, hv_out, 4, i_height );
+    pixel_avg_w4( dst, i_dst, hc_out, 4, hv_out, 4, i_height );
+}
+static void mc_xy12_w8( uint8_t *src, int i_src,
+                        uint8_t *dst, int i_dst, int i_height )
+{
+    uint8_t hc_out[16*8];   /* phase dqx=1, dqy=2: mc_hc at src */
+    uint8_t hv_out[16*8];   /* paired with mc_hv at src         */
+    mc_hc_w8( src, i_src, hc_out, 8, i_height );
+    mc_hv_w8( src, i_src, hv_out, 8, i_height );
+    pixel_avg_w8( dst, i_dst, hc_out, 8, hv_out, 8, i_height );
+}
+static void mc_xy12_w16( uint8_t *src, int i_src,
+                         uint8_t *dst, int i_dst, int i_height )
+{
+    uint8_t hc_out[16*16];   /* phase dqx=1, dqy=2: mc_hc at src */
+    uint8_t hv_out[16*16];   /* paired with mc_hv at src         */
+    mc_hc_w16( src, i_src, hc_out, 16, i_height );
+    mc_hv_w16( src, i_src, hv_out, 16, i_height );
+    pixel_avg_w16( dst, i_dst, hc_out, 16, hv_out, 16, i_height );
+}
+
+static void mc_xy32_w4( uint8_t *src, int i_src,
+                        uint8_t *dst, int i_dst, int i_height )
+{
+    uint8_t hc_out[16*4];   /* phase dqx=3, dqy=2: mc_hc at src  */
+    uint8_t hv_out[16*4];   /* paired with mc_hv one column right */
+    mc_hc_w4( src,   i_src, hc_out, 4, i_height );
+    mc_hv_w4( src+1, i_src, hv_out, 4, i_height );
+    pixel_avg_w4( dst, i_dst, hc_out, 4, hv_out, 4, i_height );
+}
+static void mc_xy32_w8( uint8_t *src, int i_src,
+                        uint8_t *dst, int i_dst, int i_height )
+{
+    uint8_t hc_out[16*8];   /* phase dqx=3, dqy=2: mc_hc at src  */
+    uint8_t hv_out[16*8];   /* paired with mc_hv one column right */
+    mc_hc_w8( src,   i_src, hc_out, 8, i_height );
+    mc_hv_w8( src+1, i_src, hv_out, 8, i_height );
+    pixel_avg_w8( dst, i_dst, hc_out, 8, hv_out, 8, i_height );
+}
+static void mc_xy32_w16( uint8_t *src, int i_src,
+                         uint8_t *dst, int i_dst, int i_height )
+{
+    uint8_t hc_out[16*16];   /* phase dqx=3, dqy=2: mc_hc at src  */
+    uint8_t hv_out[16*16];   /* paired with mc_hv one column right */
+    mc_hc_w16( src,   i_src, hc_out, 16, i_height );
+    mc_hv_w16( src+1, i_src, hv_out, 16, i_height );
+    pixel_avg_w16( dst, i_dst, hc_out, 16, hv_out, 16, i_height );
+}
+
+static void mc_xy23_w4( uint8_t *src, int i_src,
+                        uint8_t *dst, int i_dst, int i_height )
+{
+    uint8_t hc_out[16*4];   /* phase dqx=2, dqy=3: mc_hc at src */
+    uint8_t hh_out[16*4];   /* paired with mc_hh one row down   */
+    mc_hc_w4( src,       i_src, hc_out, 4, i_height );
+    mc_hh_w4( src+i_src, i_src, hh_out, 4, i_height );
+    pixel_avg_w4( dst, i_dst, hc_out, 4, hh_out, 4, i_height );
+}
+static void mc_xy23_w8( uint8_t *src, int i_src,
+                        uint8_t *dst, int i_dst, int i_height )
+{
+    uint8_t hc_out[16*8];   /* phase dqx=2, dqy=3: mc_hc at src */
+    uint8_t hh_out[16*8];   /* paired with mc_hh one row down   */
+    mc_hc_w8( src,       i_src, hc_out, 8, i_height );
+    mc_hh_w8( src+i_src, i_src, hh_out, 8, i_height );
+    pixel_avg_w8( dst, i_dst, hc_out, 8, hh_out, 8, i_height );
+}
+static void mc_xy23_w16( uint8_t *src, int i_src,
+                         uint8_t *dst, int i_dst, int i_height )
+{
+    uint8_t hc_out[16*16];   /* phase dqx=2, dqy=3: mc_hc at src */
+    uint8_t hh_out[16*16];   /* paired with mc_hh one row down   */
+    mc_hc_w16( src,       i_src, hc_out, 16, i_height );
+    mc_hh_w16( src+i_src, i_src, hh_out, 16, i_height );
+    pixel_avg_w16( dst, i_dst, hc_out, 16, hh_out, 16, i_height );
+}
+
+static void motion_compensation_luma( uint8_t *src, int i_src,
+                                      uint8_t *dst, int i_dst,
+                                      int mvx,int mvy,
+                                      int i_width, int i_height )
+{
+    static const pf_mc_t pf_mc[3][4][4] =    /*XXX [dqy][dqx] */
+    {
+        {   /* i_size 0: width 4 */
+            { mc_copy_w4,  mc_xy10_w4,    mc_hh_w4,      mc_xy30_w4 },
+            { mc_xy01_w4,  mc_xy11_w4,    mc_xy21_w4,    mc_xy31_w4 },
+            { mc_hv_w4,    mc_xy12_w4,    mc_hc_w4,      mc_xy32_w4 },
+            { mc_xy03_w4,  mc_xy13_w4,    mc_xy23_w4,    mc_xy33_w4 },
+        },
+        {   /* i_size 1: width 8 */
+            { mc_copy_w8,  mc_xy10_w8,    mc_hh_w8,      mc_xy30_w8 },
+            { mc_xy01_w8,  mc_xy11_w8,    mc_xy21_w8,    mc_xy31_w8 },
+            { mc_hv_w8,    mc_xy12_w8,    mc_hc_w8,      mc_xy32_w8 },
+            { mc_xy03_w8,  mc_xy13_w8,    mc_xy23_w8,    mc_xy33_w8 },
+        },
+        {   /* i_size 2: width 16 */
+            { mc_copy_w16, mc_xy10_w16,   mc_hh_w16,     mc_xy30_w16 },
+            { mc_xy01_w16, mc_xy11_w16,   mc_xy21_w16,   mc_xy31_w16 },
+            { mc_hv_w16,   mc_xy12_w16,   mc_hc_w16,     mc_xy32_w16 },
+            { mc_xy03_w16, mc_xy13_w16,   mc_xy23_w16,   mc_xy33_w16 },
+        }
+    };
+    const int dqx = mvx & 0x03;   /* quarter-pel phase, horizontal */
+    const int dqy = mvy & 0x03;   /* quarter-pel phase, vertical */
+    int i_size;                   /* first index into pf_mc */
+
+    /* Step to the integer-pel position; the phase selects the routine. */
+    src += (mvy >> 2) * i_src + (mvx >> 2);
+    switch( i_width )
+    {
+        case 4:  i_size = 0; break;
+        case 8:  i_size = 1; break;
+        case 16: i_size = 2; break;
+        default: return;          /* any other width: nothing is written */
+    }
+    pf_mc[i_size][dqy][dqx]( src, i_src, dst, i_dst, i_height );
+}
+
+void x264_mc_altivec_init( x264_mc_function_t pf[2] )
+{
+    pf[MC_LUMA] = motion_compensation_luma;   /* only the MC_LUMA entry is set here */
+}
diff --git a/core/ppc/mc.h b/core/ppc/mc.h
new file mode 100644
index 00000000..cf006f2b
--- /dev/null
+++ b/core/ppc/mc.h
@@ -0,0 +1,29 @@
+/*****************************************************************************
+ * mc.h: h264 encoder library
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: mc.h,v 1.1 2004/06/03 19:27:07 fenrir Exp $
+ *
+ * Authors: Eric Petit <titer@m0k.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#ifndef _PPC_MC_H
+#define _PPC_MC_H 1
+/* Sets pf[MC_LUMA] to this port's luma motion-compensation routine. */
+void x264_mc_altivec_init( x264_mc_function_t pf[2] );
+
+#endif
diff --git a/core/ppc/pixel.c b/core/ppc/pixel.c
new file mode 100644
index 00000000..16cbcc5d
--- /dev/null
+++ b/core/ppc/pixel.c
@@ -0,0 +1,215 @@
+/*****************************************************************************
+ * pixel.c: h264 encoder
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: pixel.c,v 1.1 2004/06/03 19:27:07 fenrir Exp $
+ *
+ * Authors: Eric Petit <titer@m0k.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+
+#include "x264.h"
+#include "../pixel.h"
+#include "pixel.h"
+#include "ppccommon.h"
+
+/* sad routines. Generator args: lx = LOAD_ width, ly = rows, a = vec_sum suffix, b = lane read back */
+#define PIXEL_SAD_ALTIVEC( name, lx, ly, a, b )        \
+static int name( uint8_t *pix1, int i_pix1,            \
+                 uint8_t *pix2, int i_pix2 )           \
+{                                                      \
+    int y;                                             \
+    DECLARE_ALIGNED( int, sum, 16 );                   \
+    /* per row: |pix1-pix2| as max-min, then sum4s */  \
+    LOAD_ZERO;                                         \
+    vector_u8_t  pix1v, pix2v;                         \
+    vector_s32_t sumv = zero_s32;                      \
+    for( y = 0; y < ly; y++ )                          \
+    {                                                  \
+        LOAD_##lx( pix1, pix1v );                      \
+        LOAD_##lx( pix2, pix2v );                      \
+        sumv = (vector_s32_t) vec_sum4s(               \
+                   vec_sub( vec_max( pix1v, pix2v ),   \
+                            vec_min( pix1v, pix2v ) ), \
+                   (vector_u32_t) sumv );              \
+        pix1 += i_pix1;                                \
+        pix2 += i_pix2;                                \
+    }                                                  \
+    sumv = vec_sum##a( sumv, zero_s32 );               \
+    vec_ste( vec_splat( sumv, b ), 0, &sum );          \
+    return sum;                                        \
+}
+/* 16-wide: vec_sums, total in lane 3. 8-wide: vec_sum2s, lane 1 holds bytes 0..7 only. */
+PIXEL_SAD_ALTIVEC( pixel_sad_16x16_altivec, 16, 16, s,  3 )
+PIXEL_SAD_ALTIVEC( pixel_sad_8x16_altivec,  8,  16, 2s, 1 )
+PIXEL_SAD_ALTIVEC( pixel_sad_16x8_altivec,  16, 8,  s,  3 )
+PIXEL_SAD_ALTIVEC( pixel_sad_8x8_altivec,   8,  8,  2s, 1 )
+
+/* satd routines: two 4-point Hadamard passes around a transpose, absolute sum, then halved */
+static inline int pixel_satd_8x8_altivec( uint8_t *pix1, int i_pix1,
+                                          uint8_t *pix2, int i_pix2 )
+{
+    int i;
+    DECLARE_ALIGNED( int, i_satd, 16 );
+
+    LOAD_ZERO;
+    vector_s32_t satdv = zero_s32;
+    vector_u8_t  pix1u8v, pix2u8v;
+    vector_s16_t pix1s16v, pix2s16v;
+    vector_s16_t diffv[8];
+    vector_s16_t tmpv[8];
+    vector_s16_t s01v, s23v, d01v, d23v;   /* scratch names used by HADAMAR */
+
+    /* Diff 8x8: one vector of eight s16 differences per row */
+    for( i = 0; i < 8; i++ )
+    {
+        LOAD_8( pix1, pix1u8v );
+        LOAD_8( pix2, pix2u8v );
+
+        /* u8 -> s16 conversion (first 8 bytes, interleaved with zero bytes) */
+        pix1s16v = (vector_s16_t) vec_mergeh( zero_u8, pix1u8v );
+        pix2s16v = (vector_s16_t) vec_mergeh( zero_u8, pix2u8v );
+
+        diffv[i] = vec_sub( pix1s16v, pix2s16v );
+
+        pix1 += i_pix1;
+        pix2 += i_pix2;
+    }
+
+    /* Hadamard, first pass: vectors 0-3 and 4-7 are transformed separately */
+    HADAMAR( &diffv[0], &tmpv[0] );
+    HADAMAR( &diffv[4], &tmpv[4] );
+
+    /* Transpose (tmpv is overwritten, the result lands in diffv) */
+    TRANSPOSE8x8( tmpv, diffv );
+
+    /* Hadamard, second pass on the transposed data */
+    HADAMAR( &diffv[0], &tmpv[0] );
+    HADAMAR( &diffv[4], &tmpv[4] );
+
+    /* Sum of absolute values */
+    for( i = 0; i < 8; i++ )
+    {
+        satdv = vec_sum4s( vec_abs( tmpv[i] ), satdv );
+    }
+    satdv = vec_sums( satdv, zero_s32 );
+
+    /* Done: vec_sums leaves the total in lane 3 */
+    vec_ste( vec_splat( satdv, 3 ), 0, &i_satd );
+    return i_satd / 2;
+}
+
+static int pixel_satd_16x8_altivec( uint8_t *pix1, int i_pix1,
+                                    uint8_t *pix2, int i_pix2 )
+{
+    /* Left 8x8 half, then the half starting 8 columns to the right. */
+    int i_sum = pixel_satd_8x8_altivec( pix1, i_pix1, pix2, i_pix2 );
+    i_sum += pixel_satd_8x8_altivec( pix1 + 8, i_pix1, pix2 + 8, i_pix2 );
+    return i_sum;
+}
+static int pixel_satd_8x16_altivec( uint8_t *pix1, int i_pix1,
+                                    uint8_t *pix2, int i_pix2 )
+{
+    uint8_t *low1 = pix1 + 8 * i_pix1;   /* row 8 of the first block */
+    uint8_t *low2 = pix2 + 8 * i_pix2;   /* row 8 of the second block */
+    return pixel_satd_8x8_altivec( pix1, i_pix1, pix2, i_pix2 ) +
+           pixel_satd_8x8_altivec( low1, i_pix1, low2, i_pix2 );
+}
+static int pixel_satd_16x16_altivec( uint8_t *pix1, int i_pix1,
+                                     uint8_t *pix2, int i_pix2 )
+{
+    const int dn1 = 8 * i_pix1;   /* offset of row 8 in pix1 */
+    const int dn2 = 8 * i_pix2;   /* offset of row 8 in pix2 */
+    int i_sum;                    /* total over the four 8x8 quadrants */
+    i_sum  = pixel_satd_8x8_altivec( pix1,             i_pix1, pix2,             i_pix2 );
+    i_sum += pixel_satd_8x8_altivec( pix1 + 8,         i_pix1, pix2 + 8,         i_pix2 );
+    i_sum += pixel_satd_8x8_altivec( pix1 + dn1,       i_pix1, pix2 + dn2,       i_pix2 );
+    i_sum += pixel_satd_8x8_altivec( pix1 + (dn1 + 8), i_pix1, pix2 + (dn2 + 8), i_pix2 );
+    return i_sum;
+}
+
+static inline int pixel_satd_4x4_altivec( uint8_t *pix1, int i_pix1,
+                                          uint8_t *pix2, int i_pix2 )
+{
+    int i;
+    DECLARE_ALIGNED( int, i_satd, 16 );
+
+    LOAD_ZERO;
+    vector_s32_t satdv = zero_s32;
+    vector_u8_t  pix1u8v, pix2u8v;
+    vector_s16_t pix1s16v, pix2s16v;
+    vector_s16_t diffv[4];
+    vector_s16_t tmpv[4];
+    vector_s16_t s01v, s23v, d01v, d23v;   /* scratch names used by HADAMAR */
+
+    /* Diff 4x4: four rows; TRANSPOSE4x4 later keeps only lanes 0-3 of each */
+    for( i = 0; i < 4; i++ )
+    {
+        LOAD_4( pix1, pix1u8v );
+        LOAD_4( pix2, pix2u8v );
+
+        /* u8 -> s16 conversion (first 8 bytes, interleaved with zero bytes) */
+        pix1s16v = (vector_s16_t) vec_mergeh( zero_u8, pix1u8v );
+        pix2s16v = (vector_s16_t) vec_mergeh( zero_u8, pix2u8v );
+
+        diffv[i] = vec_sub( pix1s16v, pix2s16v );
+
+        pix1 += i_pix1;
+        pix2 += i_pix2;
+    }
+
+    /* Hadamard, first pass over the four row vectors */
+    HADAMAR( diffv, tmpv );
+
+    /* Transpose (tmpv is overwritten, the result lands in diffv) */
+    TRANSPOSE4x4( tmpv, diffv );
+
+    /* Hadamard, second pass on the transposed data */
+    HADAMAR( diffv, tmpv );
+
+    /* Sum of absolute values */
+    for( i = 0; i < 4; i++ )
+    {
+        satdv = vec_sum4s( vec_abs( tmpv[i] ), satdv );
+    }
+    satdv = vec_sum2s( satdv, zero_s32 );
+
+    /* Done: vec_sum2s leaves the total of s16 lanes 0-3 in lane 1 */
+    vec_ste( vec_splat( satdv, 1 ), 0, &i_satd );
+    return i_satd / 2;
+}
+
+/****************************************************************************
+ * x264_pixel_altivec_init: install the SAD/SATD routines defined above in pixf
+ ****************************************************************************/
+void x264_pixel_altivec_init( x264_pixel_function_t *pixf )
+{
+    pixf->sad[PIXEL_16x16]  = pixel_sad_16x16_altivec;
+    pixf->sad[PIXEL_8x16]   = pixel_sad_8x16_altivec;
+    pixf->sad[PIXEL_16x8]   = pixel_sad_16x8_altivec;
+    pixf->sad[PIXEL_8x8]    = pixel_sad_8x8_altivec;   /* no other sad[] entry is assigned here */
+
+    pixf->satd[PIXEL_16x16] = pixel_satd_16x16_altivec;
+    pixf->satd[PIXEL_8x16]  = pixel_satd_8x16_altivec;
+    pixf->satd[PIXEL_16x8]  = pixel_satd_16x8_altivec;
+    pixf->satd[PIXEL_8x8]   = pixel_satd_8x8_altivec;
+    pixf->satd[PIXEL_4x4]   = pixel_satd_4x4_altivec;  /* no other satd[] entry is assigned here */
+}
diff --git a/core/ppc/pixel.h b/core/ppc/pixel.h
new file mode 100644
index 00000000..f2d6a181
--- /dev/null
+++ b/core/ppc/pixel.h
@@ -0,0 +1,29 @@
+/*****************************************************************************
+ * pixel.h: h264 encoder library
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: pixel.h,v 1.1 2004/06/03 19:27:07 fenrir Exp $
+ *
+ * Authors: Eric Petit <titer@m0k.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#ifndef _PPC_PIXEL_H
+#define _PPC_PIXEL_H 1
+/* Installs the AltiVec SAD/SATD routines into pixf (see core/ppc/pixel.c). */
+void x264_pixel_altivec_init( x264_pixel_function_t *pixf );
+
+#endif
diff --git a/core/ppc/ppccommon.h b/core/ppc/ppccommon.h
new file mode 100644
index 00000000..15f22f65
--- /dev/null
+++ b/core/ppc/ppccommon.h
@@ -0,0 +1,158 @@
+/*****************************************************************************
+ * ppccommon.h: h264 encoder
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: ppccommon.h,v 1.1 2004/06/03 19:27:07 fenrir Exp $
+ *
+ * Authors: Eric Petit <titer@m0k.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+/* Handy shorthands for the AltiVec vector types */
+#define vector_u8_t  vector unsigned char
+#define vector_s16_t vector signed short
+#define vector_u32_t vector unsigned int
+#define vector_s32_t vector signed int
+/* LOAD_ZERO declares a local named `zero`; the zero_* casts below need it in scope */
+#define LOAD_ZERO    vector_s32_t zero = vec_splat_s32( 0 )
+#define zero_u8      (vector_u8_t)  zero
+#define zero_s16     (vector_s16_t) zero
+#define zero_s32     (vector_s32_t) zero
+/* In place: a (viewed as bytes) has its first 8 bytes interleaved with zero bytes */
+#define CONVERT_U8_TO_S16( a ) \
+    a = (vector_s16_t) vec_mergeh( zero_u8, (vector_u8_t) a )
+
+/* Macros to load aligned or unaligned data without risking buffer
+   overflows: the next 16-byte line is read only when data lies in it. */
+#define LOAD_16( p, v )                                \
+    if( (int) p & 0xF )   /* unaligned: two loads */   \
+    {                                                  \
+        v = vec_perm( vec_ld( 0, p ), vec_ld( 16, p ), \
+                      vec_lvsl( 0, p ) );              \
+    }                                                  \
+    else                  /* aligned: one load */      \
+    {                                                  \
+        v = vec_ld( 0, p );                            \
+    }
+/* LOAD_8: v[0..7] = p[0..7]; the other lanes are not meaningful. Needs `zero` (LOAD_ZERO) in scope. */
+#define LOAD_8( p, v )                                             \
+    if( !( (int) p & 0xF ) )           /* aligned */               \
+    {                                                              \
+        v = vec_ld( 0, p );                                        \
+    }                                                              \
+    else if( ( (int) p & 0xF ) < 9 )   /* offset 1..8: one line */ \
+    {                                                              \
+        v = vec_perm( vec_ld( 0, p ), (vector unsigned char) zero, \
+                      vec_lvsl( 0, p ) );                          \
+    }                                                              \
+    else                               /* offset 9..15: 2 lines */ \
+    {                                                              \
+        v = vec_perm( vec_ld( 0, p ), vec_ld( 16, p ),             \
+                      vec_lvsl( 0, p ) );                          \
+    }
+/* LOAD_4: v[0..3] = p[0..3]; the other lanes are not meaningful. Needs `zero` (LOAD_ZERO) in scope. */
+#define LOAD_4( p, v )                                             \
+    if( !( (int) p & 0xF ) )            /* aligned */              \
+    {                                                              \
+        v = vec_ld( 0, p );                                        \
+    }                                                              \
+    else if( ( (int) p & 0xF ) < 13 )   /* offset 1..12: 1 line */ \
+    {                                                              \
+        v = vec_perm( vec_ld( 0, p ), (vector unsigned char) zero, \
+                      vec_lvsl( 0, p ) );                          \
+    }                                                              \
+    else                                /* offset 13..15: 2 */     \
+    {                                                              \
+        v = vec_perm( vec_ld( 0, p ), vec_ld( 16, p ),             \
+                      vec_lvsl( 0, p ) );                          \
+    }
+/* Store aligned or unaligned data. The unaligned path overwrites v with a rotated copy and
+   declares locals tmp1, tmp2, align, mask, so arguments must not use those names. */
+#define STORE_16( v, p )                              \
+    if( (int) p & 0xF )                               \
+    {                                                 \
+        vector unsigned char tmp1, tmp2;              \
+        vector unsigned char align, mask;             \
+        tmp1 = vec_ld( 0, p );                        \
+        tmp2 = vec_ld( 16, p );                       \
+        align = vec_lvsr( 0, p );                     \
+        mask = vec_perm( (vector unsigned char) (0),  \
+                         (vector unsigned char) (-1), \
+                         align);                      \
+        v = vec_perm( v, v, align);                   \
+        tmp1 = vec_sel( tmp1, v, mask );              \
+        tmp2 = vec_sel( v, tmp2, mask );              \
+        vec_st( tmp1, 0, p );                         \
+        vec_st( tmp2, 16, p );                        \
+    }                                                 \
+    else                                              \
+    {                                                 \
+        vec_st( v, 0, p );                            \
+    }
+
+/* Transpose 8x8 (vector_s16_t [8]): result in b; a is used as scratch and overwritten */
+#define TRANSPOSE8x8( a, b )           \
+    b[0] = vec_mergeh( a[0], a[4] ); \
+    b[1] = vec_mergel( a[0], a[4] ); \
+    b[2] = vec_mergeh( a[1], a[5] ); \
+    b[3] = vec_mergel( a[1], a[5] ); \
+    b[4] = vec_mergeh( a[2], a[6] ); \
+    b[5] = vec_mergel( a[2], a[6] ); \
+    b[6] = vec_mergeh( a[3], a[7] ); \
+    b[7] = vec_mergel( a[3], a[7] ); \
+    a[0] = vec_mergeh( b[0], b[4] ); \
+    a[1] = vec_mergel( b[0], b[4] ); \
+    a[2] = vec_mergeh( b[1], b[5] ); \
+    a[3] = vec_mergel( b[1], b[5] ); \
+    a[4] = vec_mergeh( b[2], b[6] ); \
+    a[5] = vec_mergel( b[2], b[6] ); \
+    a[6] = vec_mergeh( b[3], b[7] ); \
+    a[7] = vec_mergel( b[3], b[7] ); \
+    b[0] = vec_mergeh( a[0], a[4] ); \
+    b[1] = vec_mergel( a[0], a[4] ); \
+    b[2] = vec_mergeh( a[1], a[5] ); \
+    b[3] = vec_mergel( a[1], a[5] ); \
+    b[4] = vec_mergeh( a[2], a[6] ); \
+    b[5] = vec_mergel( a[2], a[6] ); \
+    b[6] = vec_mergeh( a[3], a[7] ); \
+    b[7] = vec_mergel( a[3], a[7] );
+
+/* Transpose 4x4 (vector_s16_t [4]): lanes 0-3 of a[] -> lanes 0-3 of b[], rest zero; a is overwritten; needs `zero` */
+#define TRANSPOSE4x4( a, b ) \
+    (b)[0] = vec_mergeh( (a)[0], zero_s16 ); \
+    (b)[1] = vec_mergeh( (a)[1], zero_s16 ); \
+    (b)[2] = vec_mergeh( (a)[2], zero_s16 ); \
+    (b)[3] = vec_mergeh( (a)[3], zero_s16 ); \
+    (a)[0] = vec_mergeh( (b)[0], (b)[2] );   \
+    (a)[1] = vec_mergel( (b)[0], (b)[2] );   \
+    (a)[2] = vec_mergeh( (b)[1], (b)[3] );   \
+    (a)[3] = vec_mergel( (b)[1], (b)[3] );   \
+    (b)[0] = vec_mergeh( (a)[0], (a)[2] );   \
+    (b)[1] = vec_mergel( (a)[0], (a)[2] );   \
+    (b)[2] = vec_mergeh( (a)[1], (a)[3] );   \
+    (b)[3] = vec_mergel( (a)[1], (a)[3] );
+
+/* 4-point Hadamard, lane-wise over a[0..3] -> b[0..3]; the caller must declare s01v, s23v, d01v, d23v */
+#define HADAMAR( a, b ) \
+    s01v   = vec_add( (a)[0], (a)[1] ); \
+    s23v   = vec_add( (a)[2], (a)[3] ); \
+    d01v   = vec_sub( (a)[0], (a)[1] ); \
+    d23v   = vec_sub( (a)[2], (a)[3] ); \
+    (b)[0] = vec_add( s01v, s23v );     \
+    (b)[1] = vec_sub( s01v, s23v );     \
+    (b)[2] = vec_sub( d01v, d23v );     \
+    (b)[3] = vec_add( d01v, d23v );
+
diff --git a/core/predict.c b/core/predict.c
new file mode 100644
index 00000000..6a799a09
--- /dev/null
+++ b/core/predict.c
@@ -0,0 +1,697 @@
+/*****************************************************************************
+ * predict.c: h264 encoder
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: predict.c,v 1.1 2004/06/03 19:27:07 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+/* XXX the predict_4x4_* functions are inspired by the ffmpeg h264 decoder
+ */
+
+#include <stdlib.h>
+#include <stdint.h>
+
+#include "x264.h"
+#include "predict.h"
+
+#ifdef _MSC_VER
+#undef HAVE_MMXEXT  /* not finished now */
+#endif
+#ifdef HAVE_MMXEXT
+#   include "i386/predict.h"
+#endif
+
+/* Clamp a to [0,255] without relying on arithmetic >> of a negative int
+ * (the former (-a)>>31 yielded -1, not 255, for a > 255). */
+static inline int clip_uint8( int a )
+{
+    /* any bit set outside the low 8 means a is negative or above 255 */
+    return ( a & ~255 ) ? ( a < 0 ? 0 : 255 ) : a;
+}
+
+/****************************************************************************
+ * 16x16 prediction for intra block DC, H, V, P
+ ****************************************************************************/
+/* DC prediction, 16x16: fill the block with the rounded mean of the 16
+ * pixels in the row above it and the 16 pixels in the column to its left.
+ * All neighbours are read before the first pixel is written. */
+static void predict_16x16_dc( uint8_t *src, int i_stride )
+{
+    const uint8_t *top  = src - i_stride;   /* row just above the block   */
+    const uint8_t *left = src - 1;          /* column just left of it     */
+    int sum = 16;                           /* rounding term for the >> 5 */
+    int n, row, col;
+
+    /* accumulate the 32 neighbouring samples */
+    for( n = 0; n < 16; n++ )
+        sum += top[n] + left[n * i_stride];
+    sum >>= 5;
+
+    /* replicate the mean over the whole block */
+    for( row = 0; row < 16; row++, src += i_stride )
+    {
+        for( col = 0; col < 16; col++ )
+            src[col] = sum;
+    }
+}
+/* DC prediction, 16x16, left neighbours only: fill the block with the
+ * rounded mean of the 16 pixels in the column to its left. */
+static void predict_16x16_dc_left( uint8_t *src, int i_stride )
+{
+    const uint8_t *left = src - 1;
+    int sum = 8;    /* rounding term for the >> 4 below */
+    int row, col;
+
+    for( row = 0; row < 16; row++ )
+    {
+        sum += left[row * i_stride];
+    }
+    sum >>= 4;
+
+    for( row = 0; row < 16; row++, src += i_stride )
+    {
+        for( col = 0; col < 16; col++ )
+            src[col] = sum;
+    }
+}
+/* DC prediction, 16x16, top neighbours only: fill the block with the
+ * rounded mean of the 16 pixels in the row above it. */
+static void predict_16x16_dc_top( uint8_t *src, int i_stride )
+{
+    const uint8_t *top = src - i_stride;
+    int sum = 8;    /* rounding term for the >> 4 below */
+    int row, col;
+
+    for( col = 0; col < 16; col++ )
+    {
+        sum += top[col];
+    }
+    sum >>= 4;
+
+    for( row = 0; row < 16; row++, src += i_stride )
+    {
+        for( col = 0; col < 16; col++ )
+            src[col] = sum;
+    }
+}
+/* DC prediction, 16x16, no neighbours used: fill the block with 128. */
+static void predict_16x16_dc_128( uint8_t *src, int i_stride )
+{
+    int rows_left = 16;
+
+    while( rows_left-- > 0 )
+    {
+        int col;
+        for( col = 0; col < 16; col++ )
+            src[col] = 128;
+        src += i_stride;
+    }
+}
+/* Horizontal prediction, 16x16: every row is filled with the pixel
+ * immediately to its left (src[-1] of that row). */
+static void predict_16x16_h( uint8_t *src, int i_stride )
+{
+    int row;
+
+    for( row = 0; row < 16; row++, src += i_stride )
+    {
+        const uint8_t left = src[-1];
+        uint8_t *p   = src;
+        uint8_t *end = src + 16;
+        while( p < end )
+        {
+            *p++ = left;
+        }
+    }
+}
+/* Vertical prediction, 16x16: every column repeats the pixel above it. */
+static void predict_16x16_v( uint8_t *src, int i_stride )
+{
+    const uint8_t *top = src - i_stride;
+    int row, col;
+
+    for( row = 0; row < 16; row++, src += i_stride )
+    {
+        for( col = 0; col < 16; col++ )
+            src[col] = top[col];
+    }
+}
+/* Plane prediction, 16x16: derive a horizontal slope b and a vertical slope
+ * c from the top row and left column, then evaluate the plane per pixel. */
+static void predict_16x16_p( uint8_t *src, int i_stride )
+{
+    const uint8_t *top  = src - i_stride;
+    const uint8_t *left = src - 1;
+    int grad_h = 0, grad_v = 0;
+    int a, b, c;
+    int row_base;
+    int k, x, y;
+
+    /* weighted differences of neighbours mirrored around position 7 */
+    for( k = 1; k <= 8; k++ )
+    {
+        grad_h += k * ( top[7 + k] - top[7 - k] );
+        grad_v += k * ( left[(7 + k) * i_stride] - left[(7 - k) * i_stride] );
+    }
+
+    a = 16 * ( left[15 * i_stride] + top[15] );
+    b = ( 5 * grad_h + 32 ) >> 6;
+    c = ( 5 * grad_v + 32 ) >> 6;
+
+    /* plane value (scaled by 32, rounding included) at pixel (0,0) */
+    row_base = a - 7 * b - 7 * c + 16;
+
+    for( y = 0; y < 16; y++ )
+    {
+        int acc = row_base;
+
+        for( x = 0; x < 16; x++, acc += b )
+            src[x] = clip_uint8( acc >> 5 );
+        src += i_stride;
+        row_base += c;
+    }
+}
+
+
+/****************************************************************************
+ * 8x8 prediction for intra chroma block DC, H, V, P
+ ****************************************************************************/
+/* Chroma DC prediction, no neighbours used: fill the 8x8 block with 128. */
+static void predict_8x8_dc_128( uint8_t *src, int i_stride )
+{
+    int rows_left = 8;
+
+    while( rows_left-- > 0 )
+    {
+        int col;
+        for( col = 0; col < 8; col++ )
+            src[col] = 128;
+        src += i_stride;
+    }
+}
+/* Chroma DC prediction, left neighbours only: the upper four rows get the
+ * rounded mean of left pixels 0..3, the lower four that of left pixels 4..7. */
+static void predict_8x8_dc_left( uint8_t *src, int i_stride )
+{
+    const uint8_t *left = src - 1;
+    int sum_upper = 2, sum_lower = 2;   /* rounding terms for the >> 2 */
+    int row, col;
+
+    for( row = 0; row < 4; row++ )
+    {
+        sum_upper += left[row * i_stride];
+        sum_lower += left[( row + 4 ) * i_stride];
+    }
+    sum_upper >>= 2;
+    sum_lower >>= 2;
+
+    for( row = 0; row < 8; row++, src += i_stride )
+    {
+        const int dc = row < 4 ? sum_upper : sum_lower;
+        for( col = 0; col < 8; col++ )
+            src[col] = dc;
+    }
+}
+/* Chroma DC prediction, top neighbours only: the left four columns get the
+ * rounded mean of top pixels 0..3, the right four that of top pixels 4..7. */
+static void predict_8x8_dc_top( uint8_t *src, int i_stride )
+{
+    const uint8_t *top = src - i_stride;
+    int sum_l = 2, sum_r = 2;   /* rounding terms for the >> 2 */
+    int row, col;
+
+    for( col = 0; col < 4; col++ )
+    {
+        sum_l += top[col];
+        sum_r += top[col + 4];
+    }
+    sum_l >>= 2;
+    sum_r >>= 2;
+
+    /* each row is identical: four pixels of sum_l, then four of sum_r */
+    for( row = 0; row < 8; row++, src += i_stride )
+    {
+        for( col = 0; col < 8; col++ )
+            src[col] = col < 4 ? sum_l : sum_r;
+    }
+}
+/* Chroma DC prediction using both neighbours.  Each 4x4 quadrant of the 8x8
+ * block gets its own DC value:
+ *     top-left      mean of top[0..3] and left[0..3]
+ *     top-right     mean of top[4..7]
+ *     bottom-left   mean of left[4..7]
+ *     bottom-right  mean of top[4..7] and left[4..7] */
+static void predict_8x8_dc( uint8_t *src, int i_stride )
+{
+    const uint8_t *top  = src - i_stride;
+    const uint8_t *left = src - 1;
+    int top_l = 0, top_r = 0;     /* sums over top[0..3] / top[4..7]   */
+    int left_u = 0, left_d = 0;   /* sums over left[0..3] / left[4..7] */
+    int dc[2][2];                 /* indexed [lower half][right half]  */
+    int k, row, col;
+
+    /* gather the four partial sums */
+    for( k = 0; k < 4; k++ )
+    {
+        top_l  += top[k];
+        top_r  += top[k + 4];
+        left_u += left[k * i_stride];
+        left_d += left[( k + 4 ) * i_stride];
+    }
+
+    dc[0][0] = ( top_l + left_u + 4 ) >> 3;
+    dc[0][1] = ( top_r + 2 ) >> 2;
+    dc[1][0] = ( left_d + 2 ) >> 2;
+    dc[1][1] = ( top_r + left_d + 4 ) >> 3;
+
+    /* paint every pixel with the DC of the quadrant it belongs to */
+    for( row = 0; row < 8; row++, src += i_stride )
+    {
+        const int *dc_row = dc[row >> 2];
+
+        for( col = 0; col < 8; col++ )
+        {
+            src[col] = dc_row[col >> 2];
+        }
+    }
+}
+
+/* Horizontal prediction, 8x8 chroma: every row is filled with the pixel
+ * immediately to its left (src[-1] of that row). */
+static void predict_8x8_h( uint8_t *src, int i_stride )
+{
+    int row;
+
+    for( row = 0; row < 8; row++, src += i_stride )
+    {
+        const uint8_t left = src[-1];
+        uint8_t *p   = src;
+        uint8_t *end = src + 8;
+        while( p < end )
+        {
+            *p++ = left;
+        }
+    }
+}
+/* Vertical prediction, 8x8 chroma: every column repeats the pixel above it. */
+static void predict_8x8_v( uint8_t *src, int i_stride )
+{
+    const uint8_t *top = src - i_stride;
+    int row, col;
+
+    for( row = 0; row < 8; row++, src += i_stride )
+    {
+        for( col = 0; col < 8; col++ )
+            src[col] = top[col];
+    }
+}
+
+/* Plane prediction, 8x8 chroma: derive a horizontal slope b and a vertical
+ * slope c from the top row and left column, then evaluate the plane. */
+static void predict_8x8_p( uint8_t *src, int i_stride )
+{
+    const uint8_t *top  = src - i_stride;
+    const uint8_t *left = src - 1;
+    int grad_h = 0, grad_v = 0;
+    int a, b, c;
+    int row_base;
+    int k, x, y;
+
+    /* weighted differences of neighbours mirrored around position 3 */
+    for( k = 1; k <= 4; k++ )
+    {
+        grad_h += k * ( top[3 + k] - top[3 - k] );
+        grad_v += k * ( left[(3 + k) * i_stride] - left[(3 - k) * i_stride] );
+    }
+
+    a = 16 * ( left[7 * i_stride] + top[7] );
+    b = ( 17 * grad_h + 16 ) >> 5;
+    c = ( 17 * grad_v + 16 ) >> 5;
+    row_base = a - 3 * b - 3 * c + 16;
+
+    for( y = 0; y < 8; y++ )
+    {
+        int acc = row_base;
+
+        for( x = 0; x < 8; x++, acc += b )
+            src[x] = clip_uint8( acc >> 5 );
+        src += i_stride;
+        row_base += c;
+    }
+}
+
+/****************************************************************************
+ * 4x4 prediction for intra luma block DC, H, V, DDL, DDR, VR, HD, VL, HU
+ ****************************************************************************/
+/* DC prediction, 4x4, no neighbours used: fill the block with the
+ * constant 128. */
+static void predict_4x4_dc_128( uint8_t *src, int i_stride )
+{
+    int rows_left = 4;
+
+    while( rows_left-- > 0 )
+    {
+        src[0] = src[1] = src[2] = src[3] = 128;
+        src += i_stride;
+    }
+}
+/* DC prediction, 4x4, left neighbours only: fill the block with the
+ * rounded mean of the 4 pixels in the column to its left. */
+static void predict_4x4_dc_left( uint8_t *src, int i_stride )
+{
+    const uint8_t *left = src - 1;
+    const int dc = ( left[0] + left[i_stride] + left[2 * i_stride]
+                     + left[3 * i_stride] + 2 ) >> 2;
+    int row, col;
+
+    for( row = 0; row < 4; row++, src += i_stride )
+    {
+        for( col = 0; col < 4; col++ )
+            src[col] = dc;
+    }
+}
+/* DC prediction, 4x4, top neighbours only: fill the block with the
+ * rounded mean of the 4 pixels in the row above it. */
+static void predict_4x4_dc_top( uint8_t *src, int i_stride )
+{
+    const uint8_t *top = src - i_stride;
+    const int dc = ( top[0] + top[1] + top[2] + top[3] + 2 ) >> 2;
+    int row, col;
+
+    /* every pixel of the block takes the same value */
+    for( row = 0; row < 4; row++, src += i_stride )
+    {
+        for( col = 0; col < 4; col++ )
+            src[col] = dc;
+    }
+}
+/* DC prediction, 4x4: fill the block with the rounded mean of the 4 pixels
+ * above it and the 4 pixels to its left. */
+static void predict_4x4_dc( uint8_t *src, int i_stride )
+{
+    const uint8_t *top  = src - i_stride;
+    const uint8_t *left = src - 1;
+    int sum = 4;    /* rounding term for the >> 3 below */
+    int k, row, col;
+
+    for( k = 0; k < 4; k++ )
+        sum += top[k] + left[k * i_stride];
+    sum >>= 3;
+
+    for( row = 0; row < 4; row++, src += i_stride )
+        for( col = 0; col < 4; col++ )
+            src[col] = sum;
+}
+/* Horizontal prediction, 4x4: every row is filled with the pixel
+ * immediately to its left (src[-1] of that row), which is read before
+ * the row is written. */
+static void predict_4x4_h( uint8_t *src, int i_stride )
+{
+    int row;
+
+    for( row = 0; row < 4; row++, src += i_stride )
+    {
+        const uint8_t left = src[-1];
+
+        src[0] = left;
+        src[1] = left;
+        src[2] = left;
+        src[3] = left;
+    }
+}
+/* Vertical prediction, 4x4: every column repeats the pixel above it. */
+static void predict_4x4_v( uint8_t *src, int i_stride )
+{
+    const uint8_t *top = src - i_stride;
+    int row, col;
+
+    for( row = 0; row < 4; row++, src += i_stride )
+    {
+        for( col = 0; col < 4; col++ )
+            src[col] = top[col];
+    }
+}
+
+#define PREDICT_4x4_LOAD_LEFT /* declares l0..l3: the 4 pixels in the column left of the block; needs src and i_stride in scope */ \
+    const int l0 = src[-1+0*i_stride];   \
+    const int l1 = src[-1+1*i_stride];   \
+    const int l2 = src[-1+2*i_stride];   \
+    const int l3 = src[-1+3*i_stride];
+
+#define PREDICT_4x4_LOAD_TOP /* declares t0..t3: the 4 pixels in the row directly above the block */ \
+    const int t0 = src[0-1*i_stride];   \
+    const int t1 = src[1-1*i_stride];   \
+    const int t2 = src[2-1*i_stride];   \
+    const int t3 = src[3-1*i_stride];
+
+#define PREDICT_4x4_LOAD_TOP_RIGHT /* declares t4..t7: the next 4 pixels of that row, above and to the right of the block */ \
+    const int t4 = src[4-1*i_stride];   \
+    const int t5 = src[5-1*i_stride];   \
+    const int t6 = src[6-1*i_stride];   \
+    const int t7 = src[7-1*i_stride];
+
+
+/* Diagonal-down-left 4x4 prediction: pixel (col,row) takes the 3-tap
+ * filtered top / top-right sample number col + row, so the value is constant
+ * along each anti-diagonal of the block. */
+static void predict_4x4_ddl( uint8_t *src, int i_stride )
+{
+    PREDICT_4x4_LOAD_TOP
+    PREDICT_4x4_LOAD_TOP_RIGHT
+    uint8_t diag[7];
+    int row, col;
+
+    /* one filtered value per anti-diagonal; all neighbours are read first */
+    diag[0] = ( t0 + 2 * t1 + t2 + 2 ) >> 2;
+    diag[1] = ( t1 + 2 * t2 + t3 + 2 ) >> 2;
+    diag[2] = ( t2 + 2 * t3 + t4 + 2 ) >> 2;
+    diag[3] = ( t3 + 2 * t4 + t5 + 2 ) >> 2;
+    diag[4] = ( t4 + 2 * t5 + t6 + 2 ) >> 2;
+    diag[5] = ( t5 + 2 * t6 + t7 + 2 ) >> 2;
+    diag[6] = ( t6 + 3 * t7 + 2 ) >> 2;     /* last tap reuses t7 */
+
+    /* scatter: anti-diagonal index is simply col + row */
+    for( row = 0; row < 4; row++, src += i_stride )
+    {
+        for( col = 0; col < 4; col++ )
+        {
+            src[col] = diag[row + col];
+        }
+    }
+}
+/* Diagonal-down-right 4x4 prediction: the value is constant along each
+ * diagonal col - row == d; it is a 3-tap filter over the left column, the
+ * top-left corner and the top row. */
+static void predict_4x4_ddr( uint8_t *src, int i_stride )
+{
+    const int lt = src[-1-i_stride];
+    PREDICT_4x4_LOAD_LEFT
+    PREDICT_4x4_LOAD_TOP
+    uint8_t diag[7];
+    int row, col;
+
+    /* diag[3 + d] is the value shared by all pixels with col - row == d */
+    diag[0] = ( l1 + 2 * l2 + l3 + 2 ) >> 2;
+    diag[1] = ( l0 + 2 * l1 + l2 + 2 ) >> 2;
+    diag[2] = ( lt + 2 * l0 + l1 + 2 ) >> 2;
+    diag[3] = ( t0 + 2 * lt + l0 + 2 ) >> 2;
+    diag[4] = ( lt + 2 * t0 + t1 + 2 ) >> 2;
+    diag[5] = ( t0 + 2 * t1 + t2 + 2 ) >> 2;
+    diag[6] = ( t1 + 2 * t2 + t3 + 2 ) >> 2;
+
+    /* all neighbours were read above, so the block can now be overwritten */
+    for( row = 0; row < 4; row++, src += i_stride )
+    {
+        for( col = 0; col < 4; col++ )
+        {
+            src[col] = diag[3 + col - row];
+        }
+    }
+}
+
+/* Vertical-right 4x4 prediction from the top row, the top-left corner and
+ * the left column.  The neighbours are loaded explicitly: this mode needs
+ * only l0..l2, so l3 is no longer declared and the unused-variable warning
+ * the LOAD_LEFT macro used to cause here is gone. */
+static void predict_4x4_vr( uint8_t *src, int i_stride )
+{
+    const int lt = src[-1-i_stride];
+    const int l0 = src[-1+0*i_stride];
+    const int l1 = src[-1+1*i_stride];
+    const int l2 = src[-1+2*i_stride];
+    const int t0 = src[0-1*i_stride];
+    const int t1 = src[1-1*i_stride];
+    const int t2 = src[2-1*i_stride];
+    const int t3 = src[3-1*i_stride];
+
+    /* rows 0 and 2: 2-tap averages, row 2 shifted right by one pixel */
+    src[0*i_stride+0]= src[2*i_stride+1]= ( lt + t0 + 1 ) >> 1;
+    src[0*i_stride+1]= src[2*i_stride+2]= ( t0 + t1 + 1 ) >> 1;
+    src[0*i_stride+2]= src[2*i_stride+3]= ( t1 + t2 + 1 ) >> 1;
+    src[0*i_stride+3]=                    ( t2 + t3 + 1 ) >> 1;
+
+    /* rows 1 and 3: 3-tap filtered values, row 3 shifted right by one */
+    src[1*i_stride+0]= src[3*i_stride+1]= ( l0 + 2 * lt + t0 + 2 ) >> 2;
+    src[1*i_stride+1]= src[3*i_stride+2]= ( lt + 2 * t0 + t1 + 2 ) >> 2;
+    src[1*i_stride+2]= src[3*i_stride+3]= ( t0 + 2 * t1 + t2 + 2 ) >> 2;
+    src[1*i_stride+3]=                    ( t1 + 2 * t2 + t3 + 2 ) >> 2;
+
+    /* first column of rows 2 and 3 is filtered from the left neighbours */
+    src[2*i_stride+0]= ( lt + 2 * l0 + l1 + 2 ) >> 2;
+    src[3*i_stride+0]= ( l0 + 2 * l1 + l2 + 2 ) >> 2;
+}
+
+/* Horizontal-down 4x4 prediction.  Neighbours are loaded explicitly: only
+ * t0..t2 are needed, so t3 is no longer declared (no unused-variable warning). */
+static void predict_4x4_hd( uint8_t *src, int i_stride )
+{
+    const int lt = src[-1-1*i_stride];
+    const int l0 = src[-1+0*i_stride];
+    const int l1 = src[-1+1*i_stride];
+    const int l2 = src[-1+2*i_stride];
+    const int l3 = src[-1+3*i_stride];
+    const int t0 = src[0-1*i_stride];
+    const int t1 = src[1-1*i_stride];
+    const int t2 = src[2-1*i_stride];
+
+    src[0*i_stride+0]= src[1*i_stride+2]= ( lt + l0 + 1 ) >> 1;
+    src[0*i_stride+1]= src[1*i_stride+3]= ( l0 + 2 * lt + t0 + 2 ) >> 2;
+    src[0*i_stride+2]=                    ( lt + 2 * t0 + t1 + 2 ) >> 2;
+    src[0*i_stride+3]=                    ( t0 + 2 * t1 + t2 + 2 ) >> 2;
+    src[1*i_stride+0]= src[2*i_stride+2]= ( l0 + l1 + 1 ) >> 1;
+    src[1*i_stride+1]= src[2*i_stride+3]= ( lt + 2 * l0 + l1 + 2 ) >> 2;
+    src[2*i_stride+0]= src[3*i_stride+2]= ( l1 + l2 + 1 ) >> 1;
+    src[2*i_stride+1]= src[3*i_stride+3]= ( l0 + 2 * l1 + l2 + 2 ) >> 2;
+    src[3*i_stride+0]=                    ( l2 + l3 + 1 ) >> 1;
+    src[3*i_stride+1]=                    ( l1 + 2 * l2 + l3 + 2 ) >> 2;
+}
+
+/* Vertical-left 4x4 prediction.  Neighbours are loaded explicitly: only
+ * t0..t6 are needed, so t7 is neither read nor declared (no unused warning). */
+static void predict_4x4_vl( uint8_t *src, int i_stride )
+{
+    const int t0 = src[0-1*i_stride];
+    const int t1 = src[1-1*i_stride];
+    const int t2 = src[2-1*i_stride];
+    const int t3 = src[3-1*i_stride];
+    const int t4 = src[4-1*i_stride];
+    const int t5 = src[5-1*i_stride];
+    const int t6 = src[6-1*i_stride];
+
+    src[0*i_stride+0]=                    ( t0 + t1 + 1 ) >> 1;
+    src[0*i_stride+1]= src[2*i_stride+0]= ( t1 + t2 + 1 ) >> 1;
+    src[0*i_stride+2]= src[2*i_stride+1]= ( t2 + t3 + 1 ) >> 1;
+    src[0*i_stride+3]= src[2*i_stride+2]= ( t3 + t4 + 1 ) >> 1;
+    src[2*i_stride+3]=                    ( t4 + t5 + 1 ) >> 1;
+    src[1*i_stride+0]=                    ( t0 + 2 * t1 + t2 + 2 ) >> 2;
+    src[1*i_stride+1]= src[3*i_stride+0]= ( t1 + 2 * t2 + t3 + 2 ) >> 2;
+    src[1*i_stride+2]= src[3*i_stride+1]= ( t2 + 2 * t3 + t4 + 2 ) >> 2;
+    src[1*i_stride+3]= src[3*i_stride+2]= ( t3 + 2 * t4 + t5 + 2 ) >> 2;
+    src[3*i_stride+3]=                    ( t4 + 2 * t5 + t6 + 2 ) >> 2;
+}
+
+/* Horizontal-up 4x4 prediction from the left column only: pixel (col,row)
+ * takes entry col + 2 * row of a ramp that alternates 2-tap and 3-tap
+ * filtered left samples and then saturates at l3. */
+static void predict_4x4_hu( uint8_t *src, int i_stride )
+{
+    PREDICT_4x4_LOAD_LEFT
+    uint8_t ramp[10];
+    int z, row, col;
+
+    ramp[0] = ( l0 + l1 + 1 ) >> 1;
+    ramp[1] = ( l0 + 2 * l1 + l2 + 2 ) >> 2;
+    ramp[2] = ( l1 + l2 + 1 ) >> 1;
+    ramp[3] = ( l1 + 2 * l2 + l3 + 2 ) >> 2;
+    ramp[4] = ( l2 + l3 + 1 ) >> 1;
+    ramp[5] = ( l2 + 2 * l3 + l3 + 2 ) >> 2;
+    for( z = 6; z < 10; z++ )
+        ramp[z] = l3;
+
+    for( row = 0; row < 4; row++, src += i_stride )
+    {
+        for( col = 0; col < 4; col++ )
+        {
+            src[col] = ramp[col + 2 * row];
+        }
+    }
+}
+
+/****************************************************************************
+ * Exported functions:
+ ****************************************************************************/
+/* Fill pf[] (indexed by I_PRED_16x16_*) with the C 16x16 predictors; with
+ * HAVE_MMXEXT and the MMXEXT cpu flag, pf is then passed to the mmxext init. */
+void x264_predict_16x16_init( int cpu, x264_predict_t pf[7] )
+{
+    pf[I_PRED_16x16_DC]      = predict_16x16_dc;
+    pf[I_PRED_16x16_DC_LEFT] = predict_16x16_dc_left;
+    pf[I_PRED_16x16_DC_TOP]  = predict_16x16_dc_top;
+    pf[I_PRED_16x16_DC_128]  = predict_16x16_dc_128;
+    /* directional and plane modes */
+    pf[I_PRED_16x16_V]       = predict_16x16_v;
+    pf[I_PRED_16x16_H]       = predict_16x16_h;
+    pf[I_PRED_16x16_P]       = predict_16x16_p;
+#ifdef HAVE_MMXEXT
+    if( cpu & X264_CPU_MMXEXT )
+        x264_predict_16x16_init_mmxext( pf );
+#endif
+}
+
+/* Fill pf[] (indexed by I_PRED_CHROMA_*) with the C chroma predictors; with
+ * HAVE_MMXEXT and the MMXEXT cpu flag, pf is then passed to the mmxext init. */
+void x264_predict_8x8_init( int cpu, x264_predict_t pf[7] )
+{
+    pf[I_PRED_CHROMA_DC]      = predict_8x8_dc;
+    pf[I_PRED_CHROMA_DC_LEFT] = predict_8x8_dc_left;
+    pf[I_PRED_CHROMA_DC_TOP]  = predict_8x8_dc_top;
+    pf[I_PRED_CHROMA_DC_128]  = predict_8x8_dc_128;
+    /* directional and plane modes */
+    pf[I_PRED_CHROMA_H]       = predict_8x8_h;
+    pf[I_PRED_CHROMA_V]       = predict_8x8_v;
+    pf[I_PRED_CHROMA_P]       = predict_8x8_p;
+#ifdef HAVE_MMXEXT
+    if( cpu & X264_CPU_MMXEXT )
+        x264_predict_8x8_init_mmxext( pf );
+#endif
+}
+
+/* Fill pf[] (indexed by I_PRED_4x4_*) with the C 4x4 predictors; with
+ * HAVE_MMXEXT and the MMXEXT cpu flag, pf is then passed to the mmxext init. */
+void x264_predict_4x4_init( int cpu, x264_predict_t pf[12] )
+{
+    pf[I_PRED_4x4_DC]      = predict_4x4_dc;
+    pf[I_PRED_4x4_DC_LEFT] = predict_4x4_dc_left;
+    pf[I_PRED_4x4_DC_TOP]  = predict_4x4_dc_top;
+    pf[I_PRED_4x4_DC_128]  = predict_4x4_dc_128;
+    /* directional modes */
+    pf[I_PRED_4x4_V]       = predict_4x4_v;
+    pf[I_PRED_4x4_H]       = predict_4x4_h;
+    pf[I_PRED_4x4_DDL]     = predict_4x4_ddl;
+    pf[I_PRED_4x4_DDR]     = predict_4x4_ddr;
+    pf[I_PRED_4x4_VR]      = predict_4x4_vr;
+    pf[I_PRED_4x4_HD]      = predict_4x4_hd;
+    pf[I_PRED_4x4_VL]      = predict_4x4_vl;
+    pf[I_PRED_4x4_HU]      = predict_4x4_hu;
+#ifdef HAVE_MMXEXT
+    if( cpu & X264_CPU_MMXEXT )
+        x264_predict_4x4_init_mmxext( pf );
+#endif
+}
+
diff --git a/core/predict.h b/core/predict.h
new file mode 100644
index 00000000..988e57fb
--- /dev/null
+++ b/core/predict.h
@@ -0,0 +1,92 @@
+/*****************************************************************************
+ * predict.h: h264 encoder library
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: predict.h,v 1.1 2004/06/03 19:27:07 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#ifndef _PREDICT_H
+#define _PREDICT_H 1
+
+typedef void (*x264_predict_t)( uint8_t *src, int i_stride ); /* intra predictor: fills the block at src in place; i_stride = offset between rows */
+
+enum intra_chroma_pred_e    /* index into the table filled by x264_predict_8x8_init() */
+{
+    I_PRED_CHROMA_DC = 0,
+    I_PRED_CHROMA_H  = 1,
+    I_PRED_CHROMA_V  = 2,
+    I_PRED_CHROMA_P  = 3,
+
+    I_PRED_CHROMA_DC_LEFT = 4,  /* DC computed from the left column only */
+    I_PRED_CHROMA_DC_TOP  = 5,  /* DC computed from the top row only */
+    I_PRED_CHROMA_DC_128  = 6   /* constant 128, no neighbours read */
+};
+static const int x264_mb_pred_mode8x8_fix[7] = /* folds the three DC variants (4..6) back onto I_PRED_CHROMA_DC */
+{
+    I_PRED_CHROMA_DC, I_PRED_CHROMA_H, I_PRED_CHROMA_V, I_PRED_CHROMA_P,
+    I_PRED_CHROMA_DC, I_PRED_CHROMA_DC,I_PRED_CHROMA_DC
+};
+
+enum intra16x16_pred_e      /* index into the table filled by x264_predict_16x16_init() */
+{
+    I_PRED_16x16_V  = 0,
+    I_PRED_16x16_H  = 1,
+    I_PRED_16x16_DC = 2,
+    I_PRED_16x16_P  = 3,
+
+    I_PRED_16x16_DC_LEFT = 4,   /* DC computed from the left column only */
+    I_PRED_16x16_DC_TOP  = 5,   /* DC computed from the top row only */
+    I_PRED_16x16_DC_128  = 6,   /* constant 128, no neighbours read */
+};
+static const int x264_mb_pred_mode16x16_fix[7] = /* folds the three DC variants (4..6) back onto I_PRED_16x16_DC */
+{
+    I_PRED_16x16_V, I_PRED_16x16_H, I_PRED_16x16_DC, I_PRED_16x16_P,
+    I_PRED_16x16_DC,I_PRED_16x16_DC,I_PRED_16x16_DC
+};
+
+enum intra4x4_pred_e        /* index into the table filled by x264_predict_4x4_init() */
+{
+    I_PRED_4x4_V  = 0,
+    I_PRED_4x4_H  = 1,
+    I_PRED_4x4_DC = 2,
+    I_PRED_4x4_DDL= 3,
+    I_PRED_4x4_DDR= 4,
+    I_PRED_4x4_VR = 5,
+    I_PRED_4x4_HD = 6,
+    I_PRED_4x4_VL = 7,
+    I_PRED_4x4_HU = 8,
+
+    I_PRED_4x4_DC_LEFT = 9,     /* DC computed from the left column only */
+    I_PRED_4x4_DC_TOP  = 10,    /* DC computed from the top row only */
+    I_PRED_4x4_DC_128  = 11,    /* constant 128, no neighbours read */
+};
+static const int x264_mb_pred_mode4x4_fix[12] = /* folds the three DC variants (9..11) back onto I_PRED_4x4_DC */
+{
+    I_PRED_4x4_V,   I_PRED_4x4_H,   I_PRED_4x4_DC,
+    I_PRED_4x4_DDL, I_PRED_4x4_DDR, I_PRED_4x4_VR,
+    I_PRED_4x4_HD,  I_PRED_4x4_VL,  I_PRED_4x4_HU,
+    I_PRED_4x4_DC,  I_PRED_4x4_DC,  I_PRED_4x4_DC
+};
+
+void x264_predict_16x16_init ( int cpu, x264_predict_t pf[7] );  /* fills pf[I_PRED_16x16_*]; cpu = X264_CPU_* flags */
+void x264_predict_8x8_init   ( int cpu, x264_predict_t pf[7] );  /* fills pf[I_PRED_CHROMA_*] */
+void x264_predict_4x4_init   ( int cpu, x264_predict_t pf[12] ); /* fills pf[I_PRED_4x4_*] */
+
+
+#endif
diff --git a/core/set.h b/core/set.h
new file mode 100644
index 00000000..bfd75e84
--- /dev/null
+++ b/core/set.h
@@ -0,0 +1,123 @@
+/*****************************************************************************
+ * set.h: h264 encoder
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: set.h,v 1.1 2004/06/03 19:27:07 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#ifndef _SET_H
+#define _SET_H 1
+
+enum profile_e  /* H.264 profile_idc values */
+{
+    PROFILE_BASELINE = 66,
+    PROFILE_MAIN = 77,
+    PROFILE_EXTENTED = 88, PROFILE_EXTENDED = PROFILE_EXTENTED /* misspelt original name kept so existing users still compile */
+};
+
+typedef struct  /* sequence parameter set; NOTE(review): field names appear to mirror the H.264 SPS syntax elements - confirm against the code that fills/writes it */
+{
+    int i_id;
+
+    int i_profile_idc;  /* presumably one of enum profile_e - confirm */
+    int i_level_idc;
+
+    int b_constraint_set0;
+    int b_constraint_set1;
+    int b_constraint_set2;
+
+    int i_log2_max_frame_num;
+
+    int i_poc_type;     /* selects which of the "poc 0" / "poc 1" field groups below applies */
+    /* poc 0 */
+    int i_log2_max_poc_lsb;
+    /* poc 1 */
+    int b_delta_pic_order_always_zero;
+    int i_offset_for_non_ref_pic;
+    int i_offset_for_top_to_bottom_field;
+    int i_num_ref_frames_in_poc_cycle;
+    int i_offset_for_ref_frame[256];    /* fixed capacity of 256 entries */
+
+    int i_num_ref_frames;
+    int b_gaps_in_frame_num_value_allowed;
+    int i_mb_width;     /* presumably in macroblocks - confirm */
+    int i_mb_height;    /* presumably in macroblocks - confirm */
+    int b_frame_mbs_only;
+    int b_mb_adaptive_frame_field;
+    int b_direct8x8_inference;
+
+    int b_crop;         /* presumably: the crop rectangle below is in use - confirm */
+    struct
+    {
+        int i_left;
+        int i_right;
+        int i_top;
+        int i_bottom;
+    } crop;
+
+    int b_vui;          /* presumably: the vui fields below are in use - confirm */
+    struct
+    {
+        int i_sar_width;
+        int i_sar_height;
+        /* FIXME to complete */
+    } vui;
+
+} x264_sps_t;
+
+typedef struct  /* picture parameter set; NOTE(review): field names appear to mirror the H.264 PPS syntax elements - confirm against the code that fills/writes it */
+{
+    int i_id;
+    int i_sps_id;       /* presumably the i_id of the x264_sps_t this set refers to - confirm */
+
+    int b_cabac;
+
+    int b_pic_order;
+    int i_num_slice_groups;
+
+    int i_slice_group_map_type; /* selects which of the field groups below applies */
+    /* i_slice_group_map_type == 0 */
+    int i_run_length[256];      /* FIXME */
+    /* i_slice_group_map_type == 2 */
+    int i_top_left[256];        /* FIXME */
+    int i_bottom_right[256];    /* FIXME */
+    /* i_slice_group_map_type == 3, 4, 5 */
+    int b_slice_group_change_direction;
+    int i_slice_group_change_rate;
+    /* i_slice_group_map_type == 6 */
+    int i_pic_size_in_map_units;
+    int i_slice_group_id[256];  /* FIXME */
+
+    int i_num_ref_idx_l0_active;
+    int i_num_ref_idx_l1_active;
+
+    int b_weighted_pred;
+    int b_weighted_bipred;
+
+    int i_pic_init_qp;
+    int i_pic_init_qs;
+
+    int i_chroma_qp_index_offset;
+
+    int b_deblocking_filter_control;
+    int b_constrained_intra_pred;
+    int b_redundant_pic_cnt;
+} x264_pps_t;
+
+#endif
diff --git a/core/vlc.h b/core/vlc.h
new file mode 100644
index 00000000..45779435
--- /dev/null
+++ b/core/vlc.h
@@ -0,0 +1,914 @@
+/*****************************************************************************
+ * vlc.h : vlc table
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: vlc.h,v 1.1 2004/06/03 19:27:07 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+typedef struct  /* one variable-length code word */
+{
+    int i_bits; /* code value, e.g. 0x5 for the bit string 000101 */
+    int i_size; /* code length in bits (6 for 000101); the tables use 0 for empty entries */
+} vlc_t;
+
+/* XXX: don't forget to change it if you change vlc_t */
+#define MKVLC( a, b ) { a, b } /* brace initializer in field order: a -> i_bits, b -> i_size */
+static const vlc_t x264_coeff_token[5][17*4] =
+{
+    /* table 0 */
+    {
+        MKVLC( 0x1, 1 ), /* str=1 */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+
+        MKVLC( 0x5, 6 ), /* str=000101 */
+        MKVLC( 0x1, 2 ), /* str=01 */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+
+        MKVLC( 0x7, 8 ), /* str=00000111 */
+        MKVLC( 0x4, 6 ), /* str=000100 */
+        MKVLC( 0x1, 3 ), /* str=001 */
+        MKVLC( 0x0, 0 ), /* str= */
+
+        MKVLC( 0x7, 9 ), /* str=000000111 */
+        MKVLC( 0x6, 8 ), /* str=00000110 */
+        MKVLC( 0x5, 7 ), /* str=0000101 */
+        MKVLC( 0x3, 5 ), /* str=00011 */
+
+        MKVLC( 0x7, 10 ), /* str=0000000111 */
+        MKVLC( 0x6, 9 ), /* str=000000110 */
+        MKVLC( 0x5, 8 ), /* str=00000101 */
+        MKVLC( 0x3, 6 ), /* str=000011 */
+
+        MKVLC( 0x7, 11 ), /* str=00000000111 */
+        MKVLC( 0x6, 10 ), /* str=0000000110 */
+        MKVLC( 0x5, 9 ), /* str=000000101 */
+        MKVLC( 0x4, 7 ), /* str=0000100 */
+
+        MKVLC( 0xf, 13 ), /* str=0000000001111 */
+        MKVLC( 0x6, 11 ), /* str=00000000110 */
+        MKVLC( 0x5, 10 ), /* str=0000000101 */
+        MKVLC( 0x4, 8 ), /* str=00000100 */
+
+        MKVLC( 0xb, 13 ), /* str=0000000001011 */
+        MKVLC( 0xe, 13 ), /* str=0000000001110 */
+        MKVLC( 0x5, 11 ), /* str=00000000101 */
+        MKVLC( 0x4, 9 ), /* str=000000100 */
+
+        MKVLC( 0x8, 13 ), /* str=0000000001000 */
+        MKVLC( 0xa, 13 ), /* str=0000000001010 */
+        MKVLC( 0xd, 13 ), /* str=0000000001101 */
+        MKVLC( 0x4, 10 ), /* str=0000000100 */
+
+        MKVLC( 0xf, 14 ), /* str=00000000001111 */
+        MKVLC( 0xe, 14 ), /* str=00000000001110 */
+        MKVLC( 0x9, 13 ), /* str=0000000001001 */
+        MKVLC( 0x4, 11 ), /* str=00000000100 */
+
+        MKVLC( 0xb, 14 ), /* str=00000000001011 */
+        MKVLC( 0xa, 14 ), /* str=00000000001010 */
+        MKVLC( 0xd, 14 ), /* str=00000000001101 */
+        MKVLC( 0xc, 13 ), /* str=0000000001100 */
+
+        MKVLC( 0xf, 15 ), /* str=000000000001111 */
+        MKVLC( 0xe, 15 ), /* str=000000000001110 */
+        MKVLC( 0x9, 14 ), /* str=00000000001001 */
+        MKVLC( 0xc, 14 ), /* str=00000000001100 */
+
+        MKVLC( 0xb, 15 ), /* str=000000000001011 */
+        MKVLC( 0xa, 15 ), /* str=000000000001010 */
+        MKVLC( 0xd, 15 ), /* str=000000000001101 */
+        MKVLC( 0x8, 14 ), /* str=00000000001000 */
+
+        MKVLC( 0xf, 16 ), /* str=0000000000001111 */
+        MKVLC( 0x1, 15 ), /* str=000000000000001 */
+        MKVLC( 0x9, 15 ), /* str=000000000001001 */
+        MKVLC( 0xc, 15 ), /* str=000000000001100 */
+
+        MKVLC( 0xb, 16 ), /* str=0000000000001011 */
+        MKVLC( 0xe, 16 ), /* str=0000000000001110 */
+        MKVLC( 0xd, 16 ), /* str=0000000000001101 */
+        MKVLC( 0x8, 15 ), /* str=000000000001000 */
+
+        MKVLC( 0x7, 16 ), /* str=0000000000000111 */
+        MKVLC( 0xa, 16 ), /* str=0000000000001010 */
+        MKVLC( 0x9, 16 ), /* str=0000000000001001 */
+        MKVLC( 0xc, 16 ), /* str=0000000000001100 */
+
+        MKVLC( 0x4, 16 ), /* str=0000000000000100 */
+        MKVLC( 0x6, 16 ), /* str=0000000000000110 */
+        MKVLC( 0x5, 16 ), /* str=0000000000000101 */
+        MKVLC( 0x8, 16 ), /* str=0000000000001000 */
+    },
+
+    /* table 1 */
+    {
+        MKVLC( 0x3, 2 ), /* str=11 */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+
+        MKVLC( 0xb, 6 ), /* str=001011 */
+        MKVLC( 0x2, 2 ), /* str=10 */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+
+        MKVLC( 0x7, 6 ), /* str=000111 */
+        MKVLC( 0x7, 5 ), /* str=00111 */
+        MKVLC( 0x3, 3 ), /* str=011 */
+        MKVLC( 0x0, 0 ), /* str= */
+
+        MKVLC( 0x7, 7 ), /* str=0000111 */
+        MKVLC( 0xa, 6 ), /* str=001010 */
+        MKVLC( 0x9, 6 ), /* str=001001 */
+        MKVLC( 0x5, 4 ), /* str=0101 */
+
+        MKVLC( 0x7, 8 ), /* str=00000111 */
+        MKVLC( 0x6, 6 ), /* str=000110 */
+        MKVLC( 0x5, 6 ), /* str=000101 */
+        MKVLC( 0x4, 4 ), /* str=0100 */
+
+        MKVLC( 0x4, 8 ), /* str=00000100 */
+        MKVLC( 0x6, 7 ), /* str=0000110 */
+        MKVLC( 0x5, 7 ), /* str=0000101 */
+        MKVLC( 0x6, 5 ), /* str=00110 */
+
+        MKVLC( 0x7, 9 ), /* str=000000111 */
+        MKVLC( 0x6, 8 ), /* str=00000110 */
+        MKVLC( 0x5, 8 ), /* str=00000101 */
+        MKVLC( 0x8, 6 ), /* str=001000 */
+
+        MKVLC( 0xf, 11 ), /* str=00000001111 */
+        MKVLC( 0x6, 9 ), /* str=000000110 */
+        MKVLC( 0x5, 9 ), /* str=000000101 */
+        MKVLC( 0x4, 6 ), /* str=000100 */
+
+        MKVLC( 0xb, 11 ), /* str=00000001011 */
+        MKVLC( 0xe, 11 ), /* str=00000001110 */
+        MKVLC( 0xd, 11 ), /* str=00000001101 */
+        MKVLC( 0x4, 7 ), /* str=0000100 */
+
+        MKVLC( 0xf, 12 ), /* str=000000001111 */
+        MKVLC( 0xa, 11 ), /* str=00000001010 */
+        MKVLC( 0x9, 11 ), /* str=00000001001 */
+        MKVLC( 0x4, 9 ), /* str=000000100 */
+
+        MKVLC( 0xb, 12 ), /* str=000000001011 */
+        MKVLC( 0xe, 12 ), /* str=000000001110 */
+        MKVLC( 0xd, 12 ), /* str=000000001101 */
+        MKVLC( 0xc, 11 ), /* str=00000001100 */
+
+        MKVLC( 0x8, 12 ), /* str=000000001000 */
+        MKVLC( 0xa, 12 ), /* str=000000001010 */
+        MKVLC( 0x9, 12 ), /* str=000000001001 */
+        MKVLC( 0x8, 11 ), /* str=00000001000 */
+
+        MKVLC( 0xf, 13 ), /* str=0000000001111 */
+        MKVLC( 0xe, 13 ), /* str=0000000001110 */
+        MKVLC( 0xd, 13 ), /* str=0000000001101 */
+        MKVLC( 0xc, 12 ), /* str=000000001100 */
+
+        MKVLC( 0xb, 13 ), /* str=0000000001011 */
+        MKVLC( 0xa, 13 ), /* str=0000000001010 */
+        MKVLC( 0x9, 13 ), /* str=0000000001001 */
+        MKVLC( 0xc, 13 ), /* str=0000000001100 */
+
+        MKVLC( 0x7, 13 ), /* str=0000000000111 */
+        MKVLC( 0xb, 14 ), /* str=00000000001011 */
+        MKVLC( 0x6, 13 ), /* str=0000000000110 */
+        MKVLC( 0x8, 13 ), /* str=0000000001000 */
+
+        MKVLC( 0x9, 14 ), /* str=00000000001001 */
+        MKVLC( 0x8, 14 ), /* str=00000000001000 */
+        MKVLC( 0xa, 14 ), /* str=00000000001010 */
+        MKVLC( 0x1, 13 ), /* str=0000000000001 */
+
+        MKVLC( 0x7, 14 ), /* str=00000000000111 */
+        MKVLC( 0x6, 14 ), /* str=00000000000110 */
+        MKVLC( 0x5, 14 ), /* str=00000000000101 */
+        MKVLC( 0x4, 14 ), /* str=00000000000100 */
+    },
+    /* table 2 */
+    {
+        MKVLC( 0xf, 4 ), /* str=1111 */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+
+        MKVLC( 0xf, 6 ), /* str=001111 */
+        MKVLC( 0xe, 4 ), /* str=1110 */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+
+        MKVLC( 0xb, 6 ), /* str=001011 */
+        MKVLC( 0xf, 5 ), /* str=01111 */
+        MKVLC( 0xd, 4 ), /* str=1101 */
+        MKVLC( 0x0, 0 ), /* str= */
+
+        MKVLC( 0x8, 6 ), /* str=001000 */
+        MKVLC( 0xc, 5 ), /* str=01100 */
+        MKVLC( 0xe, 5 ), /* str=01110 */
+        MKVLC( 0xc, 4 ), /* str=1100 */
+
+        MKVLC( 0xf, 7 ), /* str=0001111 */
+        MKVLC( 0xa, 5 ), /* str=01010 */
+        MKVLC( 0xb, 5 ), /* str=01011 */
+        MKVLC( 0xb, 4 ), /* str=1011 */
+
+        MKVLC( 0xb, 7 ), /* str=0001011 */
+        MKVLC( 0x8, 5 ), /* str=01000 */
+        MKVLC( 0x9, 5 ), /* str=01001 */
+        MKVLC( 0xa, 4 ), /* str=1010 */
+
+        MKVLC( 0x9, 7 ), /* str=0001001 */
+        MKVLC( 0xe, 6 ), /* str=001110 */
+        MKVLC( 0xd, 6 ), /* str=001101 */
+        MKVLC( 0x9, 4 ), /* str=1001 */
+
+        MKVLC( 0x8, 7 ), /* str=0001000 */
+        MKVLC( 0xa, 6 ), /* str=001010 */
+        MKVLC( 0x9, 6 ), /* str=001001 */
+        MKVLC( 0x8, 4 ), /* str=1000 */
+
+        MKVLC( 0xf, 8 ), /* str=00001111 */
+        MKVLC( 0xe, 7 ), /* str=0001110 */
+        MKVLC( 0xd, 7 ), /* str=0001101 */
+        MKVLC( 0xd, 5 ), /* str=01101 */
+
+        MKVLC( 0xb, 8 ), /* str=00001011 */
+        MKVLC( 0xe, 8 ), /* str=00001110 */
+        MKVLC( 0xa, 7 ), /* str=0001010 */
+        MKVLC( 0xc, 6 ), /* str=001100 */
+
+        MKVLC( 0xf, 9 ), /* str=000001111 */
+        MKVLC( 0xa, 8 ), /* str=00001010 */
+        MKVLC( 0xd, 8 ), /* str=00001101 */
+        MKVLC( 0xc, 7 ), /* str=0001100 */
+
+        MKVLC( 0xb, 9 ), /* str=000001011 */
+        MKVLC( 0xe, 9 ), /* str=000001110 */
+        MKVLC( 0x9, 8 ), /* str=00001001 */
+        MKVLC( 0xc, 8 ), /* str=00001100 */
+
+        MKVLC( 0x8, 9 ), /* str=000001000 */
+        MKVLC( 0xa, 9 ), /* str=000001010 */
+        MKVLC( 0xd, 9 ), /* str=000001101 */
+        MKVLC( 0x8, 8 ), /* str=00001000 */
+
+        MKVLC( 0xd, 10 ), /* str=0000001101 */
+        MKVLC( 0x7, 9 ), /* str=000000111 */
+        MKVLC( 0x9, 9 ), /* str=000001001 */
+        MKVLC( 0xc, 9 ), /* str=000001100 */
+
+        MKVLC( 0x9, 10 ), /* str=0000001001 */
+        MKVLC( 0xc, 10 ), /* str=0000001100 */
+        MKVLC( 0xb, 10 ), /* str=0000001011 */
+        MKVLC( 0xa, 10 ), /* str=0000001010 */
+
+        MKVLC( 0x5, 10 ), /* str=0000000101 */
+        MKVLC( 0x8, 10 ), /* str=0000001000 */
+        MKVLC( 0x7, 10 ), /* str=0000000111 */
+        MKVLC( 0x6, 10 ), /* str=0000000110 */
+
+        MKVLC( 0x1, 10 ), /* str=0000000001 */
+        MKVLC( 0x4, 10 ), /* str=0000000100 */
+        MKVLC( 0x3, 10 ), /* str=0000000011 */
+        MKVLC( 0x2, 10 ), /* str=0000000010 */
+    },
+
+    /* table 3 */
+    {
+        MKVLC( 0x3, 6 ), /* str=000011 */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+
+        MKVLC( 0x0, 6 ), /* str=000000 */
+        MKVLC( 0x1, 6 ), /* str=000001 */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+
+        MKVLC( 0x4, 6 ), /* str=000100 */
+        MKVLC( 0x5, 6 ), /* str=000101 */
+        MKVLC( 0x6, 6 ), /* str=000110 */
+        MKVLC( 0x0, 0 ), /* str= */
+
+        MKVLC( 0x8, 6 ), /* str=001000 */
+        MKVLC( 0x9, 6 ), /* str=001001 */
+        MKVLC( 0xa, 6 ), /* str=001010 */
+        MKVLC( 0xb, 6 ), /* str=001011 */
+
+        MKVLC( 0xc, 6 ), /* str=001100 */
+        MKVLC( 0xd, 6 ), /* str=001101 */
+        MKVLC( 0xe, 6 ), /* str=001110 */
+        MKVLC( 0xf, 6 ), /* str=001111 */
+
+        MKVLC( 0x10, 6 ), /* str=010000 */
+        MKVLC( 0x11, 6 ), /* str=010001 */
+        MKVLC( 0x12, 6 ), /* str=010010 */
+        MKVLC( 0x13, 6 ), /* str=010011 */
+
+        MKVLC( 0x14, 6 ), /* str=010100 */
+        MKVLC( 0x15, 6 ), /* str=010101 */
+        MKVLC( 0x16, 6 ), /* str=010110 */
+        MKVLC( 0x17, 6 ), /* str=010111 */
+
+        MKVLC( 0x18, 6 ), /* str=011000 */
+        MKVLC( 0x19, 6 ), /* str=011001 */
+        MKVLC( 0x1a, 6 ), /* str=011010 */
+        MKVLC( 0x1b, 6 ), /* str=011011 */
+
+        MKVLC( 0x1c, 6 ), /* str=011100 */
+        MKVLC( 0x1d, 6 ), /* str=011101 */
+        MKVLC( 0x1e, 6 ), /* str=011110 */
+        MKVLC( 0x1f, 6 ), /* str=011111 */
+
+        MKVLC( 0x20, 6 ), /* str=100000 */
+        MKVLC( 0x21, 6 ), /* str=100001 */
+        MKVLC( 0x22, 6 ), /* str=100010 */
+        MKVLC( 0x23, 6 ), /* str=100011 */
+
+        MKVLC( 0x24, 6 ), /* str=100100 */
+        MKVLC( 0x25, 6 ), /* str=100101 */
+        MKVLC( 0x26, 6 ), /* str=100110 */
+        MKVLC( 0x27, 6 ), /* str=100111 */
+
+        MKVLC( 0x28, 6 ), /* str=101000 */
+        MKVLC( 0x29, 6 ), /* str=101001 */
+        MKVLC( 0x2a, 6 ), /* str=101010 */
+        MKVLC( 0x2b, 6 ), /* str=101011 */
+
+        MKVLC( 0x2c, 6 ), /* str=101100 */
+        MKVLC( 0x2d, 6 ), /* str=101101 */
+        MKVLC( 0x2e, 6 ), /* str=101110 */
+        MKVLC( 0x2f, 6 ), /* str=101111 */
+
+        MKVLC( 0x30, 6 ), /* str=110000 */
+        MKVLC( 0x31, 6 ), /* str=110001 */
+        MKVLC( 0x32, 6 ), /* str=110010 */
+        MKVLC( 0x33, 6 ), /* str=110011 */
+
+        MKVLC( 0x34, 6 ), /* str=110100 */
+        MKVLC( 0x35, 6 ), /* str=110101 */
+        MKVLC( 0x36, 6 ), /* str=110110 */
+        MKVLC( 0x37, 6 ), /* str=110111 */
+
+        MKVLC( 0x38, 6 ), /* str=111000 */
+        MKVLC( 0x39, 6 ), /* str=111001 */
+        MKVLC( 0x3a, 6 ), /* str=111010 */
+        MKVLC( 0x3b, 6 ), /* str=111011 */
+
+        MKVLC( 0x3c, 6 ), /* str=111100 */
+        MKVLC( 0x3d, 6 ), /* str=111101 */
+        MKVLC( 0x3e, 6 ), /* str=111110 */
+        MKVLC( 0x3f, 6 ), /* str=111111 */
+    },
+
+    /* table 4 */
+    {
+        MKVLC( 0x1, 2 ), /* str=01 */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+
+        MKVLC( 0x7, 6 ), /* str=000111 */
+        MKVLC( 0x1, 1 ), /* str=1 */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+
+        MKVLC( 0x4, 6 ), /* str=000100 */
+        MKVLC( 0x6, 6 ), /* str=000110 */
+        MKVLC( 0x1, 3 ), /* str=001 */
+        MKVLC( 0x0, 0 ), /* str= */
+
+        MKVLC( 0x3, 6 ), /* str=000011 */
+        MKVLC( 0x3, 7 ), /* str=0000011 */
+        MKVLC( 0x2, 7 ), /* str=0000010 */
+        MKVLC( 0x5, 6 ), /* str=000101 */
+
+        MKVLC( 0x2, 6 ), /* str=000010 */
+        MKVLC( 0x3, 8 ), /* str=00000011 */
+        MKVLC( 0x2, 8 ), /* str=00000010 */
+        MKVLC( 0x0, 7 ), /* str=0000000 */
+
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+    }
+};
+
+static const vlc_t x264_level_prefix[16] = /* entry i: i zero bits followed by a single 1 */
+{
+    MKVLC( 0x01,  1 ), /* str=1 */
+    MKVLC( 0x01,  2 ), /* str=01 */
+    MKVLC( 0x01,  3 ), /* str=001 */
+    MKVLC( 0x01,  4 ), /* str=0001 */
+    MKVLC( 0x01,  5 ), /* str=00001 */
+    MKVLC( 0x01,  6 ), /* str=000001 */
+    MKVLC( 0x01,  7 ), /* str=0000001 */
+    MKVLC( 0x01,  8 ), /* str=00000001 */
+    MKVLC( 0x01,  9 ), /* str=000000001 */
+    MKVLC( 0x01, 10 ), /* str=0000000001 */
+    MKVLC( 0x01, 11 ), /* str=00000000001 */
+    MKVLC( 0x01, 12 ), /* str=000000000001 */
+    MKVLC( 0x01, 13 ), /* str=0000000000001 */
+    MKVLC( 0x01, 14 ), /* str=00000000000001 */
+    MKVLC( 0x01, 15 ), /* str=000000000000001 */
+    MKVLC( 0x01, 16 )  /* str=0000000000000001 */
+};
+
+/* x264_total_zeros[i_total_coeff-1][i_total_zeros]; MKVLC( 0x0, 0 ) pads the slots that have no code */
+static const vlc_t x264_total_zeros[15][16] =
+{
+    { /* i_total_coeff 1: i_total_zeros 0..15 */
+        MKVLC( 0x1, 1 ), /* str=1 */
+        MKVLC( 0x3, 3 ), /* str=011 */
+        MKVLC( 0x2, 3 ), /* str=010 */
+        MKVLC( 0x3, 4 ), /* str=0011 */
+        MKVLC( 0x2, 4 ), /* str=0010 */
+        MKVLC( 0x3, 5 ), /* str=00011 */
+        MKVLC( 0x2, 5 ), /* str=00010 */
+        MKVLC( 0x3, 6 ), /* str=000011 */
+        MKVLC( 0x2, 6 ), /* str=000010 */
+        MKVLC( 0x3, 7 ), /* str=0000011 */
+        MKVLC( 0x2, 7 ), /* str=0000010 */
+        MKVLC( 0x3, 8 ), /* str=00000011 */
+        MKVLC( 0x2, 8 ), /* str=00000010 */
+        MKVLC( 0x3, 9 ), /* str=000000011 */
+        MKVLC( 0x2, 9 ), /* str=000000010 */
+        MKVLC( 0x1, 9 ), /* str=000000001 */
+    },
+    { /* i_total_coeff 2: i_total_zeros 0..14 */
+        MKVLC( 0x7, 3 ), /* str=111 */
+        MKVLC( 0x6, 3 ), /* str=110 */
+        MKVLC( 0x5, 3 ), /* str=101 */
+        MKVLC( 0x4, 3 ), /* str=100 */
+        MKVLC( 0x3, 3 ), /* str=011 */
+        MKVLC( 0x5, 4 ), /* str=0101 */
+        MKVLC( 0x4, 4 ), /* str=0100 */
+        MKVLC( 0x3, 4 ), /* str=0011 */
+        MKVLC( 0x2, 4 ), /* str=0010 */
+        MKVLC( 0x3, 5 ), /* str=00011 */
+        MKVLC( 0x2, 5 ), /* str=00010 */
+        MKVLC( 0x3, 6 ), /* str=000011 */
+        MKVLC( 0x2, 6 ), /* str=000010 */
+        MKVLC( 0x1, 6 ), /* str=000001 */
+        MKVLC( 0x0, 6 ), /* str=000000 */
+        MKVLC( 0x0, 0 ), /* str= */
+    },
+    { /* i_total_coeff 3: i_total_zeros 0..13 */
+        MKVLC( 0x5, 4 ), /* str=0101 */
+        MKVLC( 0x7, 3 ), /* str=111 */
+        MKVLC( 0x6, 3 ), /* str=110 */
+        MKVLC( 0x5, 3 ), /* str=101 */
+        MKVLC( 0x4, 4 ), /* str=0100 */
+        MKVLC( 0x3, 4 ), /* str=0011 */
+        MKVLC( 0x4, 3 ), /* str=100 */
+        MKVLC( 0x3, 3 ), /* str=011 */
+        MKVLC( 0x2, 4 ), /* str=0010 */
+        MKVLC( 0x3, 5 ), /* str=00011 */
+        MKVLC( 0x2, 5 ), /* str=00010 */
+        MKVLC( 0x1, 6 ), /* str=000001 */
+        MKVLC( 0x1, 5 ), /* str=00001 */
+        MKVLC( 0x0, 6 ), /* str=000000 */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+    },
+    { /* i_total_coeff 4: i_total_zeros 0..12 */
+        MKVLC( 0x3, 5 ), /* str=00011 */
+        MKVLC( 0x7, 3 ), /* str=111 */
+        MKVLC( 0x5, 4 ), /* str=0101 */
+        MKVLC( 0x4, 4 ), /* str=0100 */
+        MKVLC( 0x6, 3 ), /* str=110 */
+        MKVLC( 0x5, 3 ), /* str=101 */
+        MKVLC( 0x4, 3 ), /* str=100 */
+        MKVLC( 0x3, 4 ), /* str=0011 */
+        MKVLC( 0x3, 3 ), /* str=011 */
+        MKVLC( 0x2, 4 ), /* str=0010 */
+        MKVLC( 0x2, 5 ), /* str=00010 */
+        MKVLC( 0x1, 5 ), /* str=00001 */
+        MKVLC( 0x0, 5 ), /* str=00000 */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+    },
+    { /* i_total_coeff 5: i_total_zeros 0..11 */
+        MKVLC( 0x5, 4 ), /* str=0101 */
+        MKVLC( 0x4, 4 ), /* str=0100 */
+        MKVLC( 0x3, 4 ), /* str=0011 */
+        MKVLC( 0x7, 3 ), /* str=111 */
+        MKVLC( 0x6, 3 ), /* str=110 */
+        MKVLC( 0x5, 3 ), /* str=101 */
+        MKVLC( 0x4, 3 ), /* str=100 */
+        MKVLC( 0x3, 3 ), /* str=011 */
+        MKVLC( 0x2, 4 ), /* str=0010 */
+        MKVLC( 0x1, 5 ), /* str=00001 */
+        MKVLC( 0x1, 4 ), /* str=0001 */
+        MKVLC( 0x0, 5 ), /* str=00000 */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+    },
+    { /* i_total_coeff 6: i_total_zeros 0..10 */
+        MKVLC( 0x1, 6 ), /* str=000001 */
+        MKVLC( 0x1, 5 ), /* str=00001 */
+        MKVLC( 0x7, 3 ), /* str=111 */
+        MKVLC( 0x6, 3 ), /* str=110 */
+        MKVLC( 0x5, 3 ), /* str=101 */
+        MKVLC( 0x4, 3 ), /* str=100 */
+        MKVLC( 0x3, 3 ), /* str=011 */
+        MKVLC( 0x2, 3 ), /* str=010 */
+        MKVLC( 0x1, 4 ), /* str=0001 */
+        MKVLC( 0x1, 3 ), /* str=001 */
+        MKVLC( 0x0, 6 ), /* str=000000 */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+    },
+    { /* i_total_coeff 7: i_total_zeros 0..9 */
+        MKVLC( 0x1, 6 ), /* str=000001 */
+        MKVLC( 0x1, 5 ), /* str=00001 */
+        MKVLC( 0x5, 3 ), /* str=101 */
+        MKVLC( 0x4, 3 ), /* str=100 */
+        MKVLC( 0x3, 3 ), /* str=011 */
+        MKVLC( 0x3, 2 ), /* str=11 */
+        MKVLC( 0x2, 3 ), /* str=010 */
+        MKVLC( 0x1, 4 ), /* str=0001 */
+        MKVLC( 0x1, 3 ), /* str=001 */
+        MKVLC( 0x0, 6 ), /* str=000000 */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+    },
+    { /* i_total_coeff 8: i_total_zeros 0..8 */
+        MKVLC( 0x1, 6 ), /* str=000001 */
+        MKVLC( 0x1, 4 ), /* str=0001 */
+        MKVLC( 0x1, 5 ), /* str=00001 */
+        MKVLC( 0x3, 3 ), /* str=011 */
+        MKVLC( 0x3, 2 ), /* str=11 */
+        MKVLC( 0x2, 2 ), /* str=10 */
+        MKVLC( 0x2, 3 ), /* str=010 */
+        MKVLC( 0x1, 3 ), /* str=001 */
+        MKVLC( 0x0, 6 ), /* str=000000 */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+    },
+    { /* i_total_coeff 9: i_total_zeros 0..7 */
+        MKVLC( 0x1, 6 ), /* str=000001 */
+        MKVLC( 0x0, 6 ), /* str=000000 */
+        MKVLC( 0x1, 4 ), /* str=0001 */
+        MKVLC( 0x3, 2 ), /* str=11 */
+        MKVLC( 0x2, 2 ), /* str=10 */
+        MKVLC( 0x1, 3 ), /* str=001 */
+        MKVLC( 0x1, 2 ), /* str=01 */
+        MKVLC( 0x1, 5 ), /* str=00001 */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+    },
+    { /* i_total_coeff 10: i_total_zeros 0..6 */
+        MKVLC( 0x1, 5 ), /* str=00001 */
+        MKVLC( 0x0, 5 ), /* str=00000 */
+        MKVLC( 0x1, 3 ), /* str=001 */
+        MKVLC( 0x3, 2 ), /* str=11 */
+        MKVLC( 0x2, 2 ), /* str=10 */
+        MKVLC( 0x1, 2 ), /* str=01 */
+        MKVLC( 0x1, 4 ), /* str=0001 */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+    },
+    { /* i_total_coeff 11: i_total_zeros 0..5 */
+        MKVLC( 0x0, 4 ), /* str=0000 */
+        MKVLC( 0x1, 4 ), /* str=0001 */
+        MKVLC( 0x1, 3 ), /* str=001 */
+        MKVLC( 0x2, 3 ), /* str=010 */
+        MKVLC( 0x1, 1 ), /* str=1 */
+        MKVLC( 0x3, 3 ), /* str=011 */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+    },
+    { /* i_total_coeff 12: i_total_zeros 0..4 */
+        MKVLC( 0x0, 4 ), /* str=0000 */
+        MKVLC( 0x1, 4 ), /* str=0001 */
+        MKVLC( 0x1, 2 ), /* str=01 */
+        MKVLC( 0x1, 1 ), /* str=1 */
+        MKVLC( 0x1, 3 ), /* str=001 */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+    },
+    { /* i_total_coeff 13: i_total_zeros 0..3 */
+        MKVLC( 0x0, 3 ), /* str=000 */
+        MKVLC( 0x1, 3 ), /* str=001 */
+        MKVLC( 0x1, 1 ), /* str=1 */
+        MKVLC( 0x1, 2 ), /* str=01 */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+    },
+    { /* i_total_coeff 14: i_total_zeros 0..2 */
+        MKVLC( 0x0, 2 ), /* str=00 */
+        MKVLC( 0x1, 2 ), /* str=01 */
+        MKVLC( 0x1, 1 ), /* str=1 */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+    },
+    { /* i_total_coeff 15: i_total_zeros 0..1 */
+        MKVLC( 0x0, 1 ), /* str=0 */
+        MKVLC( 0x1, 1 ), /* str=1 */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+    },
+};
+
+/* x264_total_zeros_dc[i_total_coeff-1][i_total_zeros]; MKVLC( 0x00, 0 ) pads the slots that have no code */
+static const vlc_t x264_total_zeros_dc[3][4] =
+{
+    { /* i_total_coeff 1: i_total_zeros 0..3 */
+        MKVLC( 0x01, 1 ), /* 1  */
+        MKVLC( 0x01, 2 ), /* 01 */
+        MKVLC( 0x01, 3 ), /* 001*/
+        MKVLC( 0x00, 3 )  /* 000*/
+    },
+    { /* i_total_coeff 2: i_total_zeros 0..2 */
+        MKVLC( 0x01, 1 ), /* 1  */
+        MKVLC( 0x01, 2 ), /* 01 */
+        MKVLC( 0x00, 2 ), /* 00 */
+        MKVLC( 0x00, 0 )  /*    */
+    },
+    { /* i_total_coeff 3: i_total_zeros 0..1 */
+        MKVLC( 0x01, 1 ), /* 1  */
+        MKVLC( 0x00, 1 ), /* 0  */
+        MKVLC( 0x00, 0 ), /*    */
+        MKVLC( 0x00, 0 )  /*    */
+    }
+};
+
+/* x264_run_before[__MIN( i_zero_left -1, 6 )][run_before]; MKVLC( 0x0, 0 ) pads the slots that have no code */
+static const vlc_t x264_run_before[7][15] =
+{
+    { /* i_zero_left 1: run_before 0..1 */
+        MKVLC( 0x1, 1 ), /* str=1 */
+        MKVLC( 0x0, 1 ), /* str=0 */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+    },
+    { /* i_zero_left 2: run_before 0..2 */
+        MKVLC( 0x1, 1 ), /* str=1 */
+        MKVLC( 0x1, 2 ), /* str=01 */
+        MKVLC( 0x0, 2 ), /* str=00 */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+    },
+    { /* i_zero_left 3: run_before 0..3 */
+        MKVLC( 0x3, 2 ), /* str=11 */
+        MKVLC( 0x2, 2 ), /* str=10 */
+        MKVLC( 0x1, 2 ), /* str=01 */
+        MKVLC( 0x0, 2 ), /* str=00 */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+    },
+    { /* i_zero_left 4: run_before 0..4 */
+        MKVLC( 0x3, 2 ), /* str=11 */
+        MKVLC( 0x2, 2 ), /* str=10 */
+        MKVLC( 0x1, 2 ), /* str=01 */
+        MKVLC( 0x1, 3 ), /* str=001 */
+        MKVLC( 0x0, 3 ), /* str=000 */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+    },
+    { /* i_zero_left 5: run_before 0..5 */
+        MKVLC( 0x3, 2 ), /* str=11 */
+        MKVLC( 0x2, 2 ), /* str=10 */
+        MKVLC( 0x3, 3 ), /* str=011 */
+        MKVLC( 0x2, 3 ), /* str=010 */
+        MKVLC( 0x1, 3 ), /* str=001 */
+        MKVLC( 0x0, 3 ), /* str=000 */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+    },
+    { /* i_zero_left 6: run_before 0..6 */
+        MKVLC( 0x3, 2 ), /* str=11 */
+        MKVLC( 0x0, 3 ), /* str=000 */
+        MKVLC( 0x1, 3 ), /* str=001 */
+        MKVLC( 0x3, 3 ), /* str=011 */
+        MKVLC( 0x2, 3 ), /* str=010 */
+        MKVLC( 0x5, 3 ), /* str=101 */
+        MKVLC( 0x4, 3 ), /* str=100 */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+    },
+    { /* i_zero_left >= 7 (row shared through the __MIN above): run_before 0..14 */
+        MKVLC( 0x7, 3 ), /* str=111 */
+        MKVLC( 0x6, 3 ), /* str=110 */
+        MKVLC( 0x5, 3 ), /* str=101 */
+        MKVLC( 0x4, 3 ), /* str=100 */
+        MKVLC( 0x3, 3 ), /* str=011 */
+        MKVLC( 0x2, 3 ), /* str=010 */
+        MKVLC( 0x1, 3 ), /* str=001 */
+        MKVLC( 0x1, 4 ), /* str=0001 */
+        MKVLC( 0x1, 5 ), /* str=00001 */
+        MKVLC( 0x1, 6 ), /* str=000001 */
+        MKVLC( 0x1, 7 ), /* str=0000001 */
+        MKVLC( 0x1, 8 ), /* str=00000001 */
+        MKVLC( 0x1, 9 ), /* str=000000001 */
+        MKVLC( 0x1, 10 ), /* str=0000000001 */
+        MKVLC( 0x1, 11 ), /* str=00000000001 */
+    },
+};
diff --git a/decoder/decoder.c b/decoder/decoder.c
new file mode 100644
index 00000000..17c70327
--- /dev/null
+++ b/decoder/decoder.c
@@ -0,0 +1,772 @@
+/*****************************************************************************
+ * x264: h264 decoder
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: decoder.c,v 1.1 2004/06/03 19:27:07 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdint.h>
+
+#include "../core/common.h"
+#include "../core/cpu.h"
+#include "../core/vlc.h"
+
+#include "macroblock.h"
+#include "set.h"
+#include "vlc.h"
+
+
+/* IDR slice: restart the poc / frame counters and forget every reference frame */
+static void x264_slice_idr( x264_t *h )
+{
+    int i_slot = 0;
+
+    h->i_poc_msb = h->i_poc_lsb = 0;
+    h->i_frame_offset = h->i_frame_num = 0;
+    if( !h->sps )
+    {
+        return; /* no sps selected: the frame slots are left untouched */
+    }
+
+    /* i_poc = -1 keeps a slot out of the reference lists, which require i_poc >= 0 */
+    while( i_slot < h->sps->i_num_ref_frames + 1 )
+    {
+        h->freference[i_slot++]->i_poc = -1;
+    }
+    h->fdec = h->freference[0];
+    h->i_ref0 = 0;
+    h->i_ref1 = 0;
+}
+
+/* The slice header is read in two parts:
+ *      - part1: everything before ref_pic_list_reordering( )
+ *      - part2: everything after  dec_ref_pic_marking( )
+ * part1 fills sh from s and points sh->pps / sh->sps at the parameter sets
+ * selected by pps_id; b_idr tells whether idr_pic_id is present.
+ * Returns 0 on success, -1 on an unusable pps_id or when bs_eof( s ) is set.
+ */
+static int x264_slice_header_part1_read( bs_t *s,
+                                         x264_slice_header_t *sh, x264_sps_t sps_array[32], x264_pps_t pps_array[256], int b_idr )
+{
+    sh->i_first_mb = bs_read_ue( s );
+    sh->i_type = bs_read_ue( s );
+    if( sh->i_type >= 5 )
+    {
+        sh->i_type -= 5; /* slice_type 5..9 name the same slice types as 0..4 */
+    }
+    sh->i_pps_id = bs_read_ue( s );
+    /* a negative id is rejected too: it would index pps_array out of bounds */
+    if( bs_eof( s ) || sh->i_pps_id < 0 || sh->i_pps_id >= 256 || pps_array[sh->i_pps_id].i_id == -1 )
+    {
+        fprintf( stderr, "invalid pps_id in slice header\n" );
+        return -1;
+    }
+
+    sh->pps = &pps_array[sh->i_pps_id];
+    sh->sps = &sps_array[sh->pps->i_sps_id];    /* valid if pps valid */
+
+    /* fields that may be absent from the stream are inferred to be 0: drop what a previous slice left in sh */
+    sh->b_field_pic = sh->b_bottom_field = 0;
+    sh->i_delta_poc_bottom = sh->i_delta_poc[0] = sh->i_delta_poc[1] = 0;
+    sh->i_redundant_pic_cnt = 0;
+
+    sh->i_frame_num = bs_read( s, sh->sps->i_log2_max_frame_num );
+    if( !sh->sps->b_frame_mbs_only )
+    {
+        sh->b_field_pic = bs_read1( s );
+        if( sh->b_field_pic )
+        {
+            sh->b_bottom_field = bs_read1( s );
+        }
+    }
+
+    sh->i_idr_pic_id = b_idr ? bs_read_ue( s ) : 0; /* only coded in IDR slices */
+
+    if( sh->sps->i_poc_type == 0 )
+    {
+        sh->i_poc_lsb = bs_read( s, sh->sps->i_log2_max_poc_lsb );
+        if( sh->pps->b_pic_order && !sh->b_field_pic )
+        {
+            sh->i_delta_poc_bottom = bs_read_se( s );
+        }
+    }
+    else if( sh->sps->i_poc_type == 1 && !sh->sps->b_delta_pic_order_always_zero )
+    {
+        sh->i_delta_poc[0] = bs_read_se( s );
+        if( sh->pps->b_pic_order && !sh->b_field_pic )
+        {
+            sh->i_delta_poc[1] = bs_read_se( s );
+        }
+    }
+
+    if( sh->pps->b_redundant_pic_cnt )
+    {
+        sh->i_redundant_pic_cnt = bs_read_ue( s );
+    }
+
+    if( sh->i_type == SLICE_TYPE_B )
+    {
+        sh->b_direct_spatial_mv_pred = bs_read1( s );
+    }
+
+    if( sh->i_type == SLICE_TYPE_P || sh->i_type == SLICE_TYPE_SP || sh->i_type == SLICE_TYPE_B )
+    {
+        sh->b_num_ref_idx_override = bs_read1( s );
+        sh->i_num_ref_idx_l0_active = sh->pps->i_num_ref_idx_l0_active; /* default */
+        sh->i_num_ref_idx_l1_active = sh->pps->i_num_ref_idx_l1_active; /* default */
+        if( sh->b_num_ref_idx_override )
+        {
+            sh->i_num_ref_idx_l0_active = bs_read_ue( s ) + 1;
+            if( sh->i_type == SLICE_TYPE_B )
+            {
+                sh->i_num_ref_idx_l1_active = bs_read_ue( s ) + 1;
+            }
+        }
+    }
+
+    return bs_eof( s ) ? -1 : 0;
+}
+
+/* Second part of the slice header: the fields that follow dec_ref_pic_marking( ).
+ * Returns 0 on success, -1 when slice_group_change_cycle would be needed (not supported). */
+static int x264_slice_header_part2_read( bs_t *s, x264_slice_header_t *sh )
+{
+    if( sh->pps->b_cabac && sh->i_type != SLICE_TYPE_I && sh->i_type != SLICE_TYPE_SI )
+    {
+        sh->i_cabac_init_idc = bs_read_ue( s );
+    }
+    sh->i_qp_delta = bs_read_se( s );
+    if( sh->i_type == SLICE_TYPE_SI || sh->i_type == SLICE_TYPE_SP )
+    {
+        if( sh->i_type == SLICE_TYPE_SP )
+        {
+            sh->b_sp_for_swidth = bs_read1( s );
+        }
+        sh->i_qs_delta = bs_read_se( s );
+    }
+
+    /* deblocking fields that are not coded take the inferred value 0 instead of a stale one */
+    sh->i_disable_deblocking_filter_idc = 0;
+    sh->i_alpha_c0_offset = 0;
+    sh->i_beta_offset = 0;
+    if( sh->pps->b_deblocking_filter_control )
+    {
+        sh->i_disable_deblocking_filter_idc = bs_read_ue( s );
+        if( sh->i_disable_deblocking_filter_idc != 1 )
+        {
+            sh->i_alpha_c0_offset = bs_read_se( s );
+            sh->i_beta_offset = bs_read_se( s );
+        }
+    }
+
+    if( sh->pps->i_num_slice_groups > 1 && sh->pps->i_slice_group_map_type >= 3 && sh->pps->i_slice_group_map_type <= 5 )
+    {
+        /* FIXME: slice_group_change_cycle is not parsed */
+        return -1;
+    }
+    return 0;
+}
+
+static int x264_slice_header_ref_pic_reordering( x264_t *h, bs_t *s )
+{
+    int i_idx;
+
+    /* slot 0 of freference is taken as the frame that is no longer needed: decode into it */
+    h->fdec = h->freference[0];
+    h->fdec->i_poc = h->i_poc;
+
+    /* Default lists from slots 1..i_num_ref_frames: slots with i_poc < 0 are
+     * skipped, lower poc than the current frame -> list 0, higher poc -> list 1 */
+    h->i_ref0 = 0;
+    h->i_ref1 = 0;
+    for( i_idx = 1; i_idx < h->sps->i_num_ref_frames + 1; i_idx++ )
+    {
+        x264_frame_t *ref = h->freference[i_idx];
+
+        if( ref->i_poc < 0 )
+        {
+            continue;
+        }
+        if( ref->i_poc < h->fdec->i_poc )
+        {
+            h->fref0[h->i_ref0++] = ref;
+        }
+        else if( ref->i_poc > h->fdec->i_poc )
+        {
+            h->fref1[h->i_ref1++] = ref;
+        }
+    }
+
+    /* list 0, highest poc first: swap neighbours, rescan from the start after a swap */
+    i_idx = 0;
+    while( i_idx < h->i_ref0 - 1 )
+    {
+        x264_frame_t *next = h->fref0[i_idx+1];
+
+        if( h->fref0[i_idx]->i_poc < next->i_poc )
+        {
+            h->fref0[i_idx+1] = h->fref0[i_idx];
+            h->fref0[i_idx] = next;
+            i_idx = 0;
+        }
+        else
+        {
+            i_idx++;
+        }
+    }
+
+    /* list 1, lowest poc first (for B-frame), same method */
+    i_idx = 0;
+    while( i_idx < h->i_ref1 - 1 )
+    {
+        x264_frame_t *next = h->fref1[i_idx+1];
+
+        if( h->fref1[i_idx]->i_poc > next->i_poc )
+        {
+            h->fref1[i_idx+1] = h->fref1[i_idx];
+            h->fref1[i_idx] = next;
+            i_idx = 0;
+        }
+        else
+        {
+            i_idx++;
+        }
+    }
+
+    /* Keep at most the number of entries announced by the pps. NOTE(review): the per-slice
+     * counts read into h->sh are not consulted here, confirm this is intended */
+    if( h->pps->i_num_ref_idx_l0_active < h->i_ref0 )
+    {
+        h->i_ref0 = h->pps->i_num_ref_idx_l0_active;
+    }
+    if( h->pps->i_num_ref_idx_l1_active < h->i_ref1 )
+    {
+        h->i_ref1 = h->pps->i_num_ref_idx_l1_active;
+    }
+
+    /* One reordering flag per list in use; a set flag is not supported and returns -1 */
+    if( h->sh.i_type != SLICE_TYPE_I && h->sh.i_type != SLICE_TYPE_SI )
+    {
+        if( bs_read1( s ) )
+        {
+            /* FIXME */
+            return -1;
+        }
+    }
+    if( h->sh.i_type == SLICE_TYPE_B )
+    {
+        if( bs_read1( s ) )
+        {
+            /* FIXME */
+            return -1;
+        }
+    }
+    return 0;
+}
+
+/* Parse the prediction weight table of a slice header.
+ * Not implemented yet: nothing is read from s and -1 is always returned. */
+static int x264_slice_header_pred_weight_table( x264_t *h, bs_t *s )
+{
+    (void)h;
+    (void)s;
+
+    return -1;
+}
+
+/* Read the decoded reference picture marking part of a slice header.
+ *
+ * For an IDR slice two one-bit flags are consumed; neither is acted upon yet.
+ * For any other slice a single flag is read, and adaptive reference picture
+ * marking is not supported.
+ *
+ * Returns 0 on success, -1 when adaptive marking is signalled. */
+static int  x264_slice_header_dec_ref_pic_marking( x264_t *h, bs_t *s, int i_nal_type  )
+{
+    if( i_nal_type != NAL_SLICE_IDR )
+    {
+        /* adaptive marking mode flag: the commands that would follow it are
+         * not parsed, so give up as soon as it is set */
+        return bs_read1( s ) ? -1 : 0;
+    }
+    else
+    {
+        /* TODO: both flags are only skipped for now */
+        const int b_no_output_of_prior_pics  = bs_read1( s );
+        const int b_long_term_reference_flag = bs_read1( s );
+
+        (void)b_no_output_of_prior_pics;
+        (void)b_long_term_reference_flag;
+    }
+
+    return 0;
+}
+
+/****************************************************************************
+ * Decode a slice header and setup h for mb decoding.
+ *
+ * Steps:
+ *  - read the first part of the header (selects the SPS/PPS of the slice),
+ *  - when the active SPS/PPS change, release the buffers of the previous
+ *    configuration and allocate macroblocks and reference frames again,
+ *  - compute the picture order count (poc types 0 and 2 only),
+ *  - read ref pic reordering, weight table, ref pic marking and the second
+ *    part of the header.
+ *
+ * Returns 0 on success, -1 on a parse error or an unsupported feature.
+ ****************************************************************************/
+static int x264_slice_header_decode( x264_t *h, bs_t *s, x264_nal_t *nal )
+{
+    /* read the first part of the slice */
+    if( x264_slice_header_part1_read( s, &h->sh,
+                                      h->sps_array, h->pps_array,
+                                      nal->i_type == NAL_SLICE_IDR ? 1 : 0 ) < 0 )
+    {
+        fprintf( stderr, "x264_slice_header_part1_read failed\n" );
+        return -1;
+    }
+
+    /* now reset h if needed for this frame */
+    if( h->sps != h->sh.sps || h->pps != h->sh.pps )
+    {
+        /* TODO */
+
+        /* Release the buffers of the previous configuration. This must be done
+         * while h->sps still points to the SPS they were created with, since
+         * its i_num_ref_frames gives the number of frames to delete. */
+        if( h->sps != NULL &&
+            h->picture->i_width != 0 && h->picture->i_height != 0 )
+        {
+            int i;
+
+            for( i = 0; i < h->sps->i_num_ref_frames + 1; i++ )
+            {
+                x264_frame_delete( h->freference[i]);
+            }
+            /* same deallocator as the one used by x264_decoder_close */
+            x264_free( h->mb );
+        }
+
+        h->sps = NULL;
+        h->pps = NULL;
+
+        h->picture->i_width = 0;
+        h->picture->i_height = 0;
+    }
+
+    /* and init if needed */
+    if( h->sps == NULL || h->pps == NULL )
+    {
+        int i;
+
+        h->sps = h->sh.sps;
+        h->pps = h->sh.pps;
+
+        h->param.i_width = h->picture->i_width = 16 * h->sps->i_mb_width;
+        h->param.i_height= h->picture->i_height= 16 * h->sps->i_mb_height;
+
+        fprintf( stderr, "x264: %dx%d\n", h->picture->i_width, h->picture->i_height );
+
+        h->mb = x264_macroblocks_new( h->sps->i_mb_width, h->sps->i_mb_height );
+
+        /* one frame per reference plus the one being decoded; a poc of -1
+         * marks a frame that does not hold a picture yet */
+        for( i = 0; i < h->sps->i_num_ref_frames + 1; i++ )
+        {
+            h->freference[i] = x264_frame_new( h );
+            h->freference[i]->i_poc = -1;
+        }
+        h->fdec = h->freference[0];
+        h->i_ref0 = 0;
+        h->i_ref1 = 0;
+
+        h->i_poc_msb = 0;
+        h->i_poc_lsb = 0;
+        h->i_frame_offset = 0;
+        h->i_frame_num = 0;
+    }
+
+    /* calculate poc for current frame */
+    if( h->sps->i_poc_type == 0 )
+    {
+        int i_max_poc_lsb = 1 << h->sps->i_log2_max_poc_lsb;
+
+        /* detect a wrap around of the lsb part, in either direction */
+        if( h->sh.i_poc_lsb < h->i_poc_lsb && h->i_poc_lsb - h->sh.i_poc_lsb >= i_max_poc_lsb/2 )
+        {
+            h->i_poc_msb += i_max_poc_lsb;
+        }
+        else if( h->sh.i_poc_lsb > h->i_poc_lsb  && h->sh.i_poc_lsb - h->i_poc_lsb > i_max_poc_lsb/2 )
+        {
+            h->i_poc_msb -= i_max_poc_lsb;
+        }
+        h->i_poc_lsb = h->sh.i_poc_lsb;
+
+        h->i_poc = h->i_poc_msb + h->sh.i_poc_lsb;
+    }
+    else if( h->sps->i_poc_type == 1 )
+    {
+        /* FIXME */
+        return -1;
+    }
+    else
+    {
+        /* poc derived from frame_num */
+        if( nal->i_type == NAL_SLICE_IDR )
+        {
+            h->i_frame_offset = 0;
+            h->i_poc = 0;
+        }
+        else
+        {
+            /* frame_num wrapped around */
+            if( h->sh.i_frame_num < h->i_frame_num )
+            {
+                h->i_frame_offset += 1 << h->sps->i_log2_max_frame_num;
+            }
+            if( nal->i_ref_idc > 0 )
+            {
+                h->i_poc = 2 * ( h->i_frame_offset + h->sh.i_frame_num );
+            }
+            else
+            {
+                h->i_poc = 2 * ( h->i_frame_offset + h->sh.i_frame_num ) - 1;
+            }
+        }
+        h->i_frame_num = h->sh.i_frame_num;
+    }
+
+    fprintf( stderr, "x264: pic type=%s poc:%d\n",
+             h->sh.i_type == SLICE_TYPE_I ? "I" : (h->sh.i_type == SLICE_TYPE_P ? "P" : "B?" ),
+             h->i_poc );
+
+    if( h->sh.i_type != SLICE_TYPE_I && h->sh.i_type != SLICE_TYPE_P )
+    {
+        fprintf( stderr, "only SLICE I/P supported\n" );
+        return -1;
+    }
+
+    /* read and do the ref pic reordering */
+    if( x264_slice_header_ref_pic_reordering( h, s ) < 0 )
+    {
+        return -1;
+    }
+
+    if( ( (h->sh.i_type == SLICE_TYPE_P || h->sh.i_type == SLICE_TYPE_SP) && h->sh.pps->b_weighted_pred  ) ||
+        ( h->sh.i_type == SLICE_TYPE_B && h->sh.pps->b_weighted_bipred ) )
+    {
+        if( x264_slice_header_pred_weight_table( h, s ) < 0 )
+        {
+            return -1;
+        }
+    }
+
+    if( nal->i_ref_idc != 0 )
+    {
+        /* On failure the marking commands have not been consumed, so what
+         * follows in the header cannot be read from the right position. */
+        if( x264_slice_header_dec_ref_pic_marking( h, s, nal->i_type ) < 0 )
+        {
+            return -1;
+        }
+    }
+
+    if( x264_slice_header_part2_read( s, &h->sh ) < 0 )
+    {
+        return -1;
+    }
+
+    return 0;
+}
+
+/* Decode the macroblocks of a slice, from h->sh.i_first_mb up to the last
+ * macroblock of the picture, then finish the picture: border expansion,
+ * deblocking, export of the planes to h->picture and rotation of the
+ * reference frame buffer.
+ *
+ * Returns the status of the last macroblock read: negative when a macroblock
+ * could not be read, in which case the picture is not finished. */
+static int x264_slice_data_decode( x264_t *h, bs_t *s )
+{
+    const int i_mb_count    = h->sps->i_mb_width * h->sps->i_mb_height;
+    const int b_inter_slice = h->sh.i_type != SLICE_TYPE_I && h->sh.i_type != SLICE_TYPE_SI;
+    x264_frame_t *p_next;
+    int i_mb_xy;
+    int i_ret = 0;
+    int i;
+
+    /* TODO: alignement and cabac init when h->pps->b_cabac is set */
+
+    /* FIXME field decoding */
+    for( i_mb_xy = h->sh.i_first_mb; i_mb_xy < i_mb_count; i_mb_xy++ )
+    {
+        x264_mb_context_t ctx;
+        x264_macroblock_t *p_mb = &h->mb[i_mb_xy];
+
+        /* load neighbour */
+        x264_macroblock_context_load( h, p_mb, &ctx );
+
+        if( h->pps->b_cabac )
+        {
+            /* TODO: skip handling for non intra slices */
+            i_ret = x264_macroblock_read_cabac( h, s, p_mb );
+        }
+        else
+        {
+            if( b_inter_slice )
+            {
+                /* a run of skipped macroblocks comes before the coded one */
+                int i_skip;
+
+                for( i_skip = bs_read_ue( s ); i_skip > 0; i_skip-- )
+                {
+                    x264_macroblock_decode_skip( h, p_mb );
+
+                    /* next macroblock */
+                    i_mb_xy++;
+                    if( i_mb_xy >= i_mb_count )
+                    {
+                        break;
+                    }
+                    p_mb++;
+
+                    /* load neighbour */
+                    x264_macroblock_context_load( h, p_mb, &ctx );
+                }
+                /* the run reached the end of the picture */
+                if( i_mb_xy >= i_mb_count )
+                {
+                    break;
+                }
+            }
+            i_ret = x264_macroblock_read_cavlc( h, s, p_mb );
+        }
+
+        if( i_ret < 0 )
+        {
+            fprintf( stderr, "x264_macroblock_read failed [%d,%d]\n", p_mb->i_mb_x, p_mb->i_mb_y );
+            break;
+        }
+
+        if( x264_macroblock_decode( h, p_mb ) < 0 )
+        {
+            fprintf( stderr, "x264_macroblock_decode failed\n" );
+            /* try to do some error correction ;) */
+        }
+    }
+
+    if( i_ret < 0 )
+    {
+        return i_ret;
+    }
+
+    /* expand border for frame reference TODO avoid it when using b-frame */
+    x264_frame_expand_border( h->fdec );
+
+    /* apply deblocking filter to the current decoded picture */
+    if( !h->pps->b_deblocking_filter_control || h->sh.i_disable_deblocking_filter_idc != 1 )
+    {
+        x264_frame_deblocking_filter( h, h->sh.i_type );
+    }
+
+#if 0
+    /* expand border for frame reference TODO avoid it when using b-frame */
+    x264_frame_expand_border( h->fdec );
+#endif
+
+    /* export the planes of the decoded frame */
+    h->picture->i_plane = h->fdec->i_plane;
+    for( i = 0; i < h->picture->i_plane; i++ )
+    {
+        h->picture->plane[i]    = h->fdec->plane[i];
+        h->picture->i_stride[i] = h->fdec->i_stride[i];
+    }
+
+    /* move frame in the buffer FIXME won't work for B-frame:
+     * the last frame of the buffer is recycled as the next frame to decode
+     * into, every other frame moves one slot towards the end */
+    p_next = h->freference[h->sps->i_num_ref_frames];
+    for( i = h->sps->i_num_ref_frames; i > 0; i-- )
+    {
+        h->freference[i] = h->freference[i-1];
+    }
+    h->freference[0] = p_next;
+    h->fdec = p_next;
+
+    return i_ret;
+}
+
+/****************************************************************************
+ *
+ ******************************* x264 libs **********************************
+ *
+ ****************************************************************************/
+
+/****************************************************************************
+ * x264_decoder_open:
+ *
+ * Allocate a decoder handle, copy param into it, mark every SPS/PPS slot as
+ * unused and set up the prediction, pixel, dct and mc functions as well as
+ * the CAVLC lookup tables.
+ *
+ * Returns the new handle, or NULL when the handle or its picture could not be
+ * allocated. The handle has to be released with x264_decoder_close.
+ ****************************************************************************/
+x264_t *x264_decoder_open   ( x264_param_t *param )
+{
+    x264_t *h = x264_malloc( sizeof( x264_t ) );
+    int i;
+
+    if( h == NULL )
+    {
+        return NULL;
+    }
+
+    memcpy( &h->param, param, sizeof( x264_param_t ) );
+
+    h->cpu = param->cpu;
+
+    /* no SPS and PPS active yet */
+    h->sps = NULL;
+    h->pps = NULL;
+
+    for( i = 0; i < 32; i++ )
+    {
+        h->sps_array[i].i_id = -1;  /* invalidate it */
+    }
+    for( i = 0; i < 256; i++ )
+    {
+        h->pps_array[i].i_id = -1;  /* invalidate it */
+    }
+
+    h->picture = x264_malloc( sizeof( x264_picture_t ) );
+    if( h->picture == NULL )
+    {
+        x264_free( h );
+        return NULL;
+    }
+    /* a 0x0 picture means that no frame buffer has been allocated yet */
+    h->picture->i_width = 0;
+    h->picture->i_height= 0;
+
+    /* init predict_XxX */
+    x264_predict_16x16_init( h->cpu, h->predict_16x16 );
+    x264_predict_8x8_init( h->cpu, h->predict_8x8 );
+    x264_predict_4x4_init( h->cpu, h->predict_4x4 );
+
+    x264_pixel_init( h->cpu, &h->pixf );
+    x264_dct_init( h->cpu, &h->dctf );
+
+    x264_mc_init( h->cpu, h->mc );
+
+    /* create the vlc tables (they could be removed from x264_t, but that
+     * would require introducing a x264_init() for library-wide data) */
+    for( i = 0; i < 5; i++ )
+    {
+        /* max 2 step */
+        h->x264_coeff_token_lookup[i] = x264_vlc_table_lookup_new( x264_coeff_token[i], 17*4, 4 );
+    }
+    /* max 2 step */
+    h->x264_level_prefix_lookup = x264_vlc_table_lookup_new( x264_level_prefix, 16, 8 );
+
+    for( i = 0; i < 15; i++ )
+    {
+        /* max 1 step */
+        h->x264_total_zeros_lookup[i] = x264_vlc_table_lookup_new( x264_total_zeros[i], 16, 9 );
+    }
+    for( i = 0;i < 3; i++ )
+    {
+        /* max 1 step */
+        h->x264_total_zeros_dc_lookup[i] = x264_vlc_table_lookup_new( x264_total_zeros_dc[i], 4, 3 );
+    }
+    for( i = 0;i < 7; i++ )
+    {
+        /* max 2 step */
+        h->x264_run_before_lookup[i] = x264_vlc_table_lookup_new( x264_run_before[i], 15, 6 );
+    }
+
+    return h;
+}
+
+/****************************************************************************
+ * x264_decoder_decode: decode one nal unit
+ *
+ * *pp_pic receives h->picture when the unit is a slice whose data was decoded
+ * without error, NULL in every other case.
+ * Returns the status of the parser that handled the unit (negative on
+ * failure), -1 for partitioned slices and 0 for the types that are ignored.
+ ****************************************************************************/
+int     x264_decoder_decode( x264_t *h,
+                             x264_picture_t **pp_pic, x264_nal_t *nal )
+{
+    bs_t bs;
+    int  i_status = 0;
+
+    /* no picture */
+    *pp_pic = NULL;
+
+    /* init bitstream reader */
+    bs_init( &bs, nal->p_payload, nal->i_payload );
+
+    switch( nal->i_type )
+    {
+        case NAL_SPS:
+            i_status = x264_sps_read( &bs, h->sps_array );
+            if( i_status < 0 )
+            {
+                fprintf( stderr, "x264: x264_sps_read failed\n" );
+            }
+            break;
+
+        case NAL_PPS:
+            i_status = x264_pps_read( &bs, h->pps_array );
+            if( i_status < 0 )
+            {
+                fprintf( stderr, "x264: x264_pps_read failed\n" );
+            }
+            break;
+
+        case NAL_SLICE_IDR:
+            fprintf( stderr, "x264: NAL_SLICE_IDR\n" );
+            x264_slice_idr( h );
+            /* fall through: the slice itself is decoded like any other one */
+
+        case NAL_SLICE:
+            i_status = x264_slice_header_decode( h, &bs, nal );
+            if( i_status < 0 )
+            {
+                fprintf( stderr, "x264: x264_slice_header_decode failed\n" );
+            }
+            else if( i_status == 0 && h->sh.i_redundant_pic_cnt == 0 )
+            {
+                /* the data of redundant pictures is not decoded */
+                i_status = x264_slice_data_decode( h, &bs );
+                if( i_status < 0 )
+                {
+                    fprintf( stderr, "x264: x264_slice_data_decode failed\n" );
+                }
+                else
+                {
+                    *pp_pic = h->picture;
+                }
+            }
+            break;
+
+        case NAL_SLICE_DPA:
+        case NAL_SLICE_DPB:
+        case NAL_SLICE_DPC:
+            fprintf( stderr, "partitioned stream unsupported\n" );
+            i_status = -1;
+            break;
+
+        case NAL_SEI:
+        default:
+            /* silently ignored */
+            break;
+    }
+
+    /* restore CPU state (before using float again) */
+    x264_cpu_restore( h->cpu );
+
+    return i_status;
+}
+
+/****************************************************************************
+ * x264_decoder_close:
+ *
+ * Release everything owned by the handle: frame buffers and macroblocks (only
+ * present once a picture size is known), the vlc lookup tables, the picture
+ * and finally the handle itself.
+ ****************************************************************************/
+void    x264_decoder_close  ( x264_t *h )
+{
+    const int b_has_frames = h->picture->i_width != 0 && h->picture->i_height != 0;
+    int i_idx;
+
+    if( b_has_frames )
+    {
+        const int i_frames = h->sps->i_num_ref_frames + 1;
+
+        for( i_idx = 0; i_idx < i_frames; i_idx++ )
+        {
+            x264_frame_delete( h->freference[i_idx]);
+        }
+        x264_free( h->mb );
+    }
+
+    /* free vlc table */
+    for( i_idx = 0; i_idx < 5; i_idx++ )
+    {
+        x264_vlc_table_lookup_delete( h->x264_coeff_token_lookup[i_idx] );
+    }
+
+    x264_vlc_table_lookup_delete( h->x264_level_prefix_lookup );
+
+    for( i_idx = 0; i_idx < 15; i_idx++ )
+    {
+        x264_vlc_table_lookup_delete( h->x264_total_zeros_lookup[i_idx] );
+    }
+
+    for( i_idx = 0; i_idx < 3; i_idx++ )
+    {
+        x264_vlc_table_lookup_delete( h->x264_total_zeros_dc_lookup[i_idx] );
+    }
+
+    for( i_idx = 0; i_idx < 7; i_idx++ )
+    {
+        x264_vlc_table_lookup_delete( h->x264_run_before_lookup[i_idx] );
+    }
+
+    /* the handle goes last, everything above is reached through it */
+    x264_free( h->picture );
+    x264_free( h );
+}
+
diff --git a/decoder/macroblock.c b/decoder/macroblock.c
new file mode 100644
index 00000000..0d580e45
--- /dev/null
+++ b/decoder/macroblock.c
@@ -0,0 +1,1097 @@
+/*****************************************************************************
+ * macroblock.c: h264 decoder library
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: macroblock.c,v 1.1 2004/06/03 19:27:07 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdint.h>
+
+#include "../core/common.h"
+#include "../core/vlc.h"
+#include "vlc.h"
+#include "macroblock.h"
+
+/* Position of the 16 blocks of a macroblock on a 4x4 grid:
+ * block_idx_x[i] / block_idx_y[i] give the column / row of block i. */
+static const uint8_t block_idx_x[16] =
+{
+    0, 1, 0, 1, 2, 3, 2, 3, 0, 1, 0, 1, 2, 3, 2, 3
+};
+static const uint8_t block_idx_y[16] =
+{
+    0, 0, 1, 1, 0, 0, 1, 1, 2, 2, 3, 3, 2, 2, 3, 3
+};
+/* Inverse mapping: block_idx_xy[x][y] is the index i for which
+ * block_idx_x[i] == x and block_idx_y[i] == y. */
+static const uint8_t block_idx_xy[4][4] =
+{
+    { 0, 2, 8,  10},
+    { 1, 3, 9,  11},
+    { 4, 6, 12, 14},
+    { 5, 7, 13, 15}
+};
+
+/* Coded block pattern tables, indexed by the Exp-Golomb code (0..47) read
+ * from the stream. x264_macroblock_read_cavlc takes the low 4 bits of the
+ * value as luma cbp and the bits above as chroma cbp.
+ * This one is used for I_4x4 macroblocks. */
+static const int golomb_to_intra4x4_cbp[48]=
+{
+    47, 31, 15,  0, 23, 27, 29, 30,  7, 11, 13, 14, 39, 43, 45, 46,
+    16,  3,  5, 10, 12, 19, 21, 26, 28, 35, 37, 42, 44,  1,  2,  4,
+     8, 17, 18, 20, 24,  6,  9, 22, 25, 32, 33, 34, 36, 40, 38, 41
+};
+/* Same layout, used for every other macroblock type that codes a cbp. */
+static const int golomb_to_inter_cbp[48]=
+{
+     0, 16,  1,  2,  4,  8, 32,  3,  5, 10, 12, 15, 47,  7, 11, 13,
+    14,  6,  9, 31, 35, 37, 42, 44, 33, 34, 36, 40, 39, 43, 45, 46,
+    17, 18, 20, 24, 19, 21, 26, 28, 23, 27, 29, 30, 22, 25, 38, 41
+};
+
+/* Chroma quantiser as a function of an index in 0..51: identity up to 29,
+ * then growing more slowly and saturating at 39.
+ * NOTE(review): presumably indexed by the luma qp of the macroblock (plus a
+ * chroma offset) - the users of this table are not next to it, verify there. */
+static const int i_chroma_qp_table[52] =
+{
+     0,  1,  2,  3,  4,  5,  6,  7,  8,  9,
+    10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+    20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+    29, 30, 31, 32, 32, 33, 34, 34, 35, 35,
+    36, 36, 37, 37, 37, 38, 38, 38, 39, 39,
+    39, 39
+};
+
+
+/* Store the reference index i_ref, for list i_list, in every cell of
+ * mb->partition[][] covered by partition i_part of the macroblock.
+ * The covered area depends on mb->i_partition only: 4x4 cells for D_16x16,
+ * 4x2 for D_16x8, 2x4 for D_8x16 and 2x2 for anything else (D_8x8). */
+void x264_mb_partition_ref_set( x264_macroblock_t *mb, int i_list, int i_part, int i_ref )
+{
+    int i_x0, i_y0;
+    int i_cols, i_rows;
+    int i_col, i_row;
+
+    /* top left cell of the partition */
+    x264_mb_partition_getxy( mb, i_part, 0, &i_x0, &i_y0 );
+
+    /* full width for 16x16 and 16x8, full height for 16x16 and 8x16 */
+    i_cols = ( mb->i_partition == D_16x16 || mb->i_partition == D_16x8 ) ? 4 : 2;
+    i_rows = ( mb->i_partition == D_16x16 || mb->i_partition == D_8x16 ) ? 4 : 2;
+
+    for( i_row = 0; i_row < i_rows; i_row++ )
+    {
+        for( i_col = 0; i_col < i_cols; i_col++ )
+        {
+            mb->partition[i_x0+i_col][i_y0+i_row].i_ref[i_list] = i_ref;
+        }
+    }
+}
+
+/* Store the motion vector mv (two components), for list i_list, in every
+ * cell of mb->partition[][] covered by sub partition i_sub of partition
+ * i_part. Origin and size of the area come from x264_mb_partition_getxy and
+ * x264_mb_partition_size. */
+void x264_mb_partition_mv_set( x264_macroblock_t *mb, int i_list, int i_part, int i_sub, int mv[2] )
+{
+    int i_x0, i_y0;
+    int i_cols, i_rows;
+    int i_col, i_row;
+
+    x264_mb_partition_getxy( mb, i_part, i_sub, &i_x0, &i_y0 );
+    x264_mb_partition_size ( mb, i_part, i_sub, &i_cols, &i_rows );
+
+    for( i_row = 0; i_row < i_rows; i_row++ )
+    {
+        for( i_col = 0; i_col < i_cols; i_col++ )
+        {
+            mb->partition[i_x0+i_col][i_y0+i_row].mv[i_list][0] = mv[0];
+            mb->partition[i_x0+i_col][i_y0+i_row].mv[i_list][1] = mv[1];
+        }
+    }
+}
+
+
+/* Read one macroblock of a CABAC coded slice.
+ * Not implemented yet: nothing is read and -1 is always returned. */
+int x264_macroblock_read_cabac( x264_t *h, bs_t *s, x264_macroblock_t *mb )
+{
+    (void)h;
+    (void)s;
+    (void)mb;
+
+    return -1;
+}
+
+/* Read the samples of an I_PCM macroblock.
+ * TODO: not implemented, nothing is read and -1 is always returned. */
+static int x264_macroblock_decode_ipcm( x264_t *h, bs_t *s, x264_macroblock_t *mb )
+{
+    (void)h;
+    (void)s;
+    (void)mb;
+
+    return -1;
+}
+
+
+/* Pseudo block indexes given to block_residual_read_cavlc for the DC blocks.
+ * They are negative so that they are never taken for an index into
+ * mb->block[]. */
+#define BLOCK_INDEX_CHROMA_DC   (-1)
+#define BLOCK_INDEX_LUMA_DC     (-2)
+
+/* Read one variable length code from s using a multi step lookup table.
+ *
+ * The first table->i_lookup_bits bits select an entry. A non negative i_size
+ * is the length of the code and i_value the decoded value. A negative i_size
+ * redirects to a sub table: i_value is its offset in table->lookup and
+ * -i_size the number of bits selecting the entry in it.
+ *
+ * Returns the decoded value, or -1 when an index falls outside of the table
+ * or after more than 5 redirections. */
+static int bs_read_vlc( bs_t *s, x264_vlc_table_t *table )
+{
+    int i_entry;
+    int i_code;
+    int i_len;
+    int i_depth;
+
+    i_entry = bs_show( s, table->i_lookup_bits );
+    if( i_entry >= table->i_lookup )
+    {
+        return( -1 );
+    }
+    i_code = table->lookup[i_entry].i_value;
+    i_len  = table->lookup[i_entry].i_size;
+
+    for( i_depth = 1; i_len < 0; i_depth++ )
+    {
+        if( i_depth > 5 )
+        {
+            return( -1 );        // FIXME what to do ?
+        }
+
+        /* drop the bits already looked up, then index the sub table */
+        bs_skip( s, table->i_lookup_bits );
+
+        i_entry = bs_show( s, -i_len ) + i_code;
+        if( i_entry >= table->i_lookup )
+        {
+            return( -1 );
+        }
+        i_code = table->lookup[i_entry].i_value;
+        i_len  = table->lookup[i_entry].i_size;
+    }
+
+    /* consume the bits of the final code */
+    bs_skip( s, i_len );
+
+    return( i_code );
+}
+
+/* Read the CAVLC coded coefficients of one residual block.
+ *
+ *  i_idx   index of the block in mb->block[], or BLOCK_INDEX_LUMA_DC /
+ *          BLOCK_INDEX_CHROMA_DC for the DC blocks
+ *  l       output: i_count coefficients, cleared first
+ *  i_count capacity of l (callers use 4, 15 or 16)
+ *
+ * For i_idx >= 0 the number of coefficients is also stored in
+ * mb->block[i_idx].i_non_zero_count.
+ *
+ * Returns 0 on success, -1 on an invalid code or when the stream announces
+ * more coefficients or zeros than the block can hold. */
+static int block_residual_read_cavlc( x264_t *h, bs_t *s, x264_macroblock_t *mb,
+                                      int i_idx, int *l, int i_count )
+{
+    int i;
+    int level[16], run[16];
+    int i_coeff;
+
+    int i_tt;
+    int i_total, i_trailing;
+    int i_suffix_length;
+    int i_zero_left;
+
+    for( i = 0; i < i_count; i++ )
+    {
+        l[i] = 0;
+    }
+
+    /* total/trailing */
+    if( i_idx == BLOCK_INDEX_CHROMA_DC )
+    {
+        i_tt = bs_read_vlc( s, h->x264_coeff_token_lookup[4] );
+    }
+    else
+    {
+        /* x264_mb_predict_non_zero_code return 0 <-> (16+16+1)>>1 = 16 */
+        static const int ct_index[17] = {0,0,1,1,2,2,2,2,3,3,3,3,3,3,3,3,3 };
+        int nC;
+
+        /* the luma DC block uses the prediction of block 0 */
+        if( i_idx == BLOCK_INDEX_LUMA_DC )
+        {
+            nC = x264_mb_predict_non_zero_code( h, mb, 0 );
+        }
+        else
+        {
+            nC = x264_mb_predict_non_zero_code( h, mb, i_idx );
+        }
+
+        i_tt = bs_read_vlc( s, h->x264_coeff_token_lookup[ct_index[nC]] );
+    }
+    if( i_tt < 0 )
+    {
+        return -1;
+    }
+
+    /* the token packs both counts: 4 * total + trailing ones */
+    i_total = i_tt / 4;
+    i_trailing = i_tt % 4;
+
+    /* The token tables are shared between block sizes, so a corrupted stream
+     * can announce more coefficients than this block has room for (16 for a
+     * 15 coefficients AC block for instance). Refuse it before anything is
+     * written to l[], level[] or run[]. */
+    if( i_total > i_count || i_total > 16 )
+    {
+        return -1;
+    }
+
+    if( i_idx >= 0 )
+    {
+        mb->block[i_idx].i_non_zero_count = i_total;
+    }
+
+    if( i_total <= 0 )
+    {
+        return 0;
+    }
+
+    i_suffix_length = i_total > 10 && i_trailing < 3 ? 1 : 0;
+
+    /* trailing ones: only a sign bit each */
+    for( i = 0; i < i_trailing; i++ )
+    {
+        level[i] = 1 - 2 * bs_read1( s );
+    }
+
+    /* other levels: prefix + suffix */
+    for( ; i < i_total; i++ )
+    {
+        int i_prefix;
+        int i_level_code;
+
+        i_prefix = bs_read_vlc( s, h->x264_level_prefix_lookup );
+
+        if( i_prefix == -1 )
+        {
+            return -1;
+        }
+        else if( i_prefix < 14 )
+        {
+            if( i_suffix_length > 0 )
+            {
+                i_level_code = (i_prefix << i_suffix_length) + bs_read( s, i_suffix_length );
+            }
+            else
+            {
+                i_level_code = i_prefix;
+            }
+        }
+        else if( i_prefix == 14 )
+        {
+            if( i_suffix_length > 0 )
+            {
+                i_level_code = (i_prefix << i_suffix_length) + bs_read( s, i_suffix_length );
+            }
+            else
+            {
+                i_level_code = i_prefix + bs_read( s, 4 );
+            }
+        }
+        else /* if( i_prefix == 15 ) */
+        {
+            i_level_code = (i_prefix << i_suffix_length) + bs_read( s, 12 );
+            if( i_suffix_length == 0 )
+            {
+                i_level_code += 15;
+            }
+        }
+        /* the first level after less than 3 trailing ones cannot be +-1 */
+        if( i == i_trailing && i_trailing < 3 )
+        {
+            i_level_code += 2;
+        }
+        /* Optimise */
+        level[i] = i_level_code&0x01 ? -((i_level_code+1)/2) : (i_level_code+2)/2;
+
+        if( i_suffix_length == 0 )
+        {
+            i_suffix_length++;
+        }
+        if( abs( level[i] ) > ( 3 << ( i_suffix_length - 1 ) ) && i_suffix_length < 6 )
+        {
+            i_suffix_length++;
+        }
+    }
+
+    /* total number of zeros before the last coefficient */
+    if( i_total < i_count )
+    {
+        if( i_idx == BLOCK_INDEX_CHROMA_DC )
+        {
+            i_zero_left = bs_read_vlc( s, h->x264_total_zeros_dc_lookup[i_total-1] );
+        }
+        else
+        {
+            i_zero_left = bs_read_vlc( s, h->x264_total_zeros_lookup[i_total-1] );
+        }
+        /* coefficients and zeros together have to fit in the block, else the
+         * last coefficient would be stored past the end of l[] */
+        if( i_zero_left < 0 || i_zero_left > i_count - i_total )
+        {
+            return -1;
+        }
+    }
+    else
+    {
+        i_zero_left = 0;
+    }
+
+    /* run of zeros before each coefficient but the last one */
+    for( i = 0; i < i_total - 1; i++ )
+    {
+        if( i_zero_left <= 0 )
+        {
+            break;
+        }
+        run[i] = bs_read_vlc( s, h->x264_run_before_lookup[X264_MIN( i_zero_left - 1, 6 )] );
+
+        if( run[i] < 0 )
+        {
+            return -1;
+        }
+        i_zero_left -= run[i];
+    }
+    if( i_zero_left < 0 )
+    {
+        return -1;
+    }
+
+    for( ; i < i_total - 1; i++ )
+    {
+        run[i] = 0;
+    }
+    /* the zeros that are left come before the last coefficient */
+    run[i_total-1] = i_zero_left;
+
+    /* levels were read from the last coefficient to the first one */
+    i_coeff = -1;
+    for( i = i_total - 1; i >= 0; i-- )
+    {
+        i_coeff += run[i] + 1;
+        l[i_coeff] = level[i];
+    }
+
+    return 0;
+}
+
+/* Set the first i_count entries of l to 0 (nothing is done for i_count <= 0). */
+static inline void array_zero_set( int *l, int i_count )
+{
+    for( ; i_count > 0; i_count-- )
+    {
+        *l++ = 0;
+    }
+}
+
+/* Read one macroblock of a CAVLC coded slice from s into mb: type, prediction
+ * modes or reference indexes and motion vectors, coded block pattern, qp and
+ * residual coefficients. Residual arrays that are not coded are cleared.
+ *
+ * Only what is needed for I and P macroblocks is implemented; B macroblock
+ * types and I_PCM fail.
+ *
+ * Returns 0 on success, -1 on an invalid or unsupported syntax element. */
+int x264_macroblock_read_cavlc( x264_t *h, bs_t *s, x264_macroblock_t *mb )
+{
+    int i_mb_i_offset;
+    int i_mb_p_offset;
+    int b_sub_ref0 = 0;
+    int i_type;
+    int i;
+
+    /* read the mb type */
+    /* offsets of the first P and first I type in the numbering of the slice */
+    switch( h->sh.i_type )
+    {
+        case SLICE_TYPE_I:
+            i_mb_p_offset = 0;  /* shut up gcc */
+            i_mb_i_offset = 0;
+            break;
+        case SLICE_TYPE_P:
+            i_mb_p_offset = 0;
+            i_mb_i_offset = 5;
+            break;
+        case SLICE_TYPE_B:
+            i_mb_p_offset = 23;
+            i_mb_i_offset = 23 + 5;
+            break;
+        default:
+            fprintf( stderr, "internal error or slice unsupported\n" );
+            return -1;
+    }
+
+    i_type = bs_read_ue( s );
+
+    if( i_type < i_mb_i_offset )
+    {
+        if( i_type < i_mb_p_offset )
+        {
+            fprintf( stderr, "unsupported mb type(B*)\n" );
+            /* TODO for B frame */
+            return -1;
+        }
+        else
+        {
+            i_type -= i_mb_p_offset;
+
+            if( i_type == 0 )
+            {
+                mb->i_type = P_L0;
+                mb->i_partition = D_16x16;
+            }
+            else if( i_type == 1 )
+            {
+                mb->i_type = P_L0;
+                mb->i_partition = D_16x8;
+            }
+            else if( i_type == 2 )
+            {
+                mb->i_type = P_L0;
+                mb->i_partition = D_8x16;
+            }
+            else if( i_type == 3 || i_type == 4 )
+            {
+                mb->i_type = P_8x8;
+                mb->i_partition = D_8x8;
+                /* type 4: every sub partition uses reference 0, no index is coded */
+                b_sub_ref0 = i_type == 4 ? 1 : 0;
+            }
+            else
+            {
+                fprintf( stderr, "invalid mb type\n" );
+                return -1;
+            }
+        }
+    }
+    else
+    {
+        i_type -= i_mb_i_offset;
+
+        if( i_type == 0 )
+        {
+            mb->i_type = I_4x4;
+        }
+        else if( i_type < 25 )
+        {
+            /* prediction mode and cbp are part of the type */
+            mb->i_type = I_16x16;
+
+            mb->i_intra16x16_pred_mode = (i_type - 1)%4;
+            mb->i_cbp_chroma = ( (i_type-1) / 4 )%3;
+            mb->i_cbp_luma   = ((i_type-1) / 12) ? 0x0f : 0x00;
+        }
+        else if( i_type == 25 )
+        {
+            mb->i_type = I_PCM;
+        }
+        else
+        {
+            fprintf( stderr, "invalid mb type (%d)\n", i_type );
+            return -1;
+        }
+    }
+
+    if( mb->i_type == I_PCM )
+    {
+        return x264_macroblock_decode_ipcm( h, s, mb );
+    }
+
+    if( IS_INTRA( mb->i_type ) )
+    {
+        int i_chroma_pred_mode;
+
+        if( mb->i_type == I_4x4 )
+        {
+            for( i = 0; i < 16; i++ )
+            {
+                int b_coded;
+
+                /* set: use the predicted mode as is */
+                b_coded = bs_read1( s );
+
+                if( b_coded )
+                {
+                    mb->block[i].i_intra4x4_pred_mode = x264_mb_predict_intra4x4_mode( h, mb, i );
+                }
+                else
+                {
+                    int i_predicted_mode = x264_mb_predict_intra4x4_mode( h, mb, i );
+                    int i_mode = bs_read( s, 3 );
+
+                    /* the 3 bits code one of the modes other than the predicted one */
+                    if( i_mode >= i_predicted_mode )
+                    {
+                        mb->block[i].i_intra4x4_pred_mode = i_mode + 1;
+                    }
+                    else
+                    {
+                        mb->block[i].i_intra4x4_pred_mode = i_mode;
+                    }
+                }
+            }
+        }
+
+        /* only 4 chroma prediction modes can be coded, anything else comes
+         * from a corrupted stream and must not reach the predictors */
+        i_chroma_pred_mode = bs_read_ue( s );
+        if( i_chroma_pred_mode < 0 || i_chroma_pred_mode > 3 )
+        {
+            fprintf( stderr, "invalid chroma pred mode (%d)\n", i_chroma_pred_mode );
+            return -1;
+        }
+        mb->i_chroma_pred_mode = i_chroma_pred_mode;
+    }
+    else if( mb->i_type == P_8x8 || mb->i_type == B_8x8)
+    {
+        /* FIXME won't work for B_8x8 */
+
+        for( i = 0; i < 4; i++ )
+        {
+            int i_sub_partition;
+
+            i_sub_partition = bs_read_ue( s );
+            switch( i_sub_partition )
+            {
+                case 0:
+                    mb->i_sub_partition[i] = D_L0_8x8;
+                    break;
+                case 1:
+                    mb->i_sub_partition[i] = D_L0_8x4;
+                    break;
+                case 2:
+                    mb->i_sub_partition[i] = D_L0_4x8;
+                    break;
+                case 3:
+                    mb->i_sub_partition[i] = D_L0_4x4;
+                    break;
+                default:
+                    fprintf( stderr, "invalid i_sub_partition\n" );
+                    return -1;
+            }
+        }
+        for( i = 0; i < 4; i++ )
+        {
+            int i_ref;
+
+            i_ref = b_sub_ref0 ? 0 : bs_read_te( s, h->sh.i_num_ref_idx_l0_active - 1 );
+            x264_mb_partition_ref_set( mb, 0, i, i_ref );
+        }
+        for( i = 0; i < 4; i++ )
+        {
+            int i_sub;
+            int i_ref;
+
+            x264_mb_partition_get( mb, 0, i, 0, &i_ref, NULL, NULL );
+
+            for( i_sub = 0; i_sub < x264_mb_partition_count_table[mb->i_sub_partition[i]]; i_sub++ )
+            {
+                int mv[2];
+
+                /* the stream codes the difference to the predicted vector */
+                x264_mb_predict_mv( mb, 0, i, i_sub, mv );
+                mv[0] += bs_read_se( s );
+                mv[1] += bs_read_se( s );
+
+                x264_mb_partition_mv_set( mb, 0, i, i_sub, mv );
+            }
+        }
+    }
+    else if( mb->i_type != B_DIRECT )
+    {
+        /* FIXME will work only for P block */
+
+        /* FIXME using x264_mb_partition_set/x264_mb_partition_get here are too unoptimised
+         * I should introduce ref and mv get/set */
+
+        /* Motion Vector */
+        int i_part = x264_mb_partition_count_table[mb->i_partition];
+
+        /* all the reference indexes come first, then all the vectors */
+        for( i = 0; i < i_part; i++ )
+        {
+            int i_ref;
+
+            i_ref = bs_read_te( s, h->sh.i_num_ref_idx_l0_active - 1 );
+
+            x264_mb_partition_ref_set( mb, 0, i, i_ref );
+        }
+
+        for( i = 0; i < i_part; i++ )
+        {
+            int mv[2];
+
+            x264_mb_predict_mv( mb, 0, i, 0, mv );
+
+            mv[0] += bs_read_se( s );
+            mv[1] += bs_read_se( s );
+
+            x264_mb_partition_mv_set( mb, 0, i, 0, mv );
+        }
+    }
+
+    /* coded block pattern (I_16x16 got it from the mb type) */
+    if( mb->i_type != I_16x16 )
+    {
+        int i_cbp;
+
+        i_cbp = bs_read_ue( s );
+        if( i_cbp >= 48 )
+        {
+            fprintf( stderr, "invalid cbp\n" );
+            return -1;
+        }
+
+        if( mb->i_type == I_4x4 )
+        {
+            i_cbp = golomb_to_intra4x4_cbp[i_cbp];
+        }
+        else
+        {
+            i_cbp = golomb_to_inter_cbp[i_cbp];
+        }
+        mb->i_cbp_luma   = i_cbp&0x0f;
+        mb->i_cbp_chroma = i_cbp >> 4;
+    }
+
+    if( mb->i_cbp_luma > 0 || mb->i_cbp_chroma > 0 || mb->i_type == I_16x16 )
+    {
+        const int i_qp_delta = bs_read_se( s );
+
+        /* the qp delta of a macroblock is limited to [-26,25] */
+        if( i_qp_delta < -26 || i_qp_delta > 25 )
+        {
+            fprintf( stderr, "invalid mb qp delta (%d)\n", i_qp_delta );
+            return -1;
+        }
+        /* qp wraps around inside 0..51 so that it stays a valid quantiser.
+         * NOTE(review): the delta is applied to the slice qp here, not to the
+         * qp of the previous macroblock - confirm against the standard. */
+        mb->i_qp = ( h->pps->i_pic_init_qp + h->sh.i_qp_delta + i_qp_delta + 52 ) % 52;
+
+        /* read residual */
+        if( mb->i_type == I_16x16 )
+        {
+            /* DC Luma */
+            if( block_residual_read_cavlc( h, s, mb, BLOCK_INDEX_LUMA_DC , mb->luma16x16_dc, 16 ) < 0 )
+            {
+                return -1;
+            }
+
+            if( mb->i_cbp_luma != 0 )
+            {
+                /* AC Luma */
+                for( i = 0; i < 16; i++ )
+                {
+                    if( block_residual_read_cavlc( h, s, mb, i, mb->block[i].residual_ac, 15 ) < 0 )
+                    {
+                        return -1;
+                    }
+                }
+            }
+            else
+            {
+                for( i = 0; i < 16; i++ )
+                {
+                    mb->block[i].i_non_zero_count = 0;
+                    array_zero_set( mb->block[i].residual_ac, 15 );
+                }
+            }
+        }
+        else
+        {
+            for( i = 0; i < 16; i++ )
+            {
+                /* one cbp bit for each group of 4 consecutive blocks */
+                if( mb->i_cbp_luma & ( 1 << ( i / 4 ) ) )
+                {
+                    if( block_residual_read_cavlc( h, s, mb, i, mb->block[i].luma4x4, 16 ) < 0 )
+                    {
+                        return -1;
+                    }
+                }
+                else
+                {
+                    mb->block[i].i_non_zero_count = 0;
+                    array_zero_set( mb->block[i].luma4x4, 16 );
+                }
+            }
+        }
+
+        if( mb->i_cbp_chroma &0x03 )    /* Chroma DC residual present */
+        {
+            if( block_residual_read_cavlc( h, s, mb, BLOCK_INDEX_CHROMA_DC, mb->chroma_dc[0], 4 ) < 0 ||
+                block_residual_read_cavlc( h, s, mb, BLOCK_INDEX_CHROMA_DC, mb->chroma_dc[1], 4 ) < 0 )
+            {
+                return -1;
+            }
+        }
+        else
+        {
+            array_zero_set( mb->chroma_dc[0], 4 );
+            array_zero_set( mb->chroma_dc[1], 4 );
+        }
+        if( mb->i_cbp_chroma&0x02 ) /* Chroma AC residual present */
+        {
+            for( i = 0; i < 8; i++ )
+            {
+                if( block_residual_read_cavlc( h, s, mb, 16 + i, mb->block[16+i].residual_ac, 15 ) < 0 )
+                {
+                    return -1;
+                }
+            }
+        }
+        else
+        {
+            for( i = 0; i < 8; i++ )
+            {
+                mb->block[16+i].i_non_zero_count = 0;
+                array_zero_set( mb->block[16+i].residual_ac, 15 );
+            }
+        }
+    }
+    else
+    {
+        /* nothing is coded: slice qp and no coefficient at all */
+        mb->i_qp = h->pps->i_pic_init_qp + h->sh.i_qp_delta;
+        for( i = 0; i < 16; i++ )
+        {
+            mb->block[i].i_non_zero_count = 0;
+            array_zero_set( mb->block[i].luma4x4, 16 );
+        }
+        array_zero_set( mb->chroma_dc[0], 4 );
+        array_zero_set( mb->chroma_dc[1], 4 );
+        for( i = 0; i < 8; i++ )
+        {
+            array_zero_set( mb->block[16+i].residual_ac, 15 );
+            mb->block[16+i].i_non_zero_count = 0;
+        }
+    }
+
+    //fprintf( stderr, "mb read type=%d\n", mb->i_type );
+
+    return 0;
+}
+
+
+
+
+/* Turn the signalled intra 16x16 prediction mode into one that only uses the
+ * neighbours flagged in mb->i_neighbour:
+ *  - left and top present: i_mode is returned untouched
+ *  - left only: DC -> DC_LEFT, H is kept
+ *  - top only:  DC -> DC_TOP,  V is kept
+ *  - none:      DC_128
+ * Any other mode with a single neighbour is reported on stderr and replaced
+ * by the matching one-sided DC mode. */
+static int x264_mb_pred_mode16x16_valid( x264_macroblock_t *mb, int i_mode )
+{
+    int i_dc_fallback;  /* DC variant usable with the neighbour we have */
+    int i_directional;  /* the only directional mode still legal */
+
+    if( ( mb->i_neighbour & (MB_LEFT|MB_TOP) ) == (MB_LEFT|MB_TOP) )
+    {
+        return i_mode;
+    }
+
+    if( ( mb->i_neighbour & MB_LEFT ) )
+    {
+        i_dc_fallback = I_PRED_16x16_DC_LEFT;
+        i_directional = I_PRED_16x16_H;
+    }
+    else if( ( mb->i_neighbour & MB_TOP ) )
+    {
+        i_dc_fallback = I_PRED_16x16_DC_TOP;
+        i_directional = I_PRED_16x16_V;
+    }
+    else
+    {
+        return I_PRED_16x16_DC_128;
+    }
+
+    if( i_mode == I_PRED_16x16_DC )
+    {
+        return i_dc_fallback;
+    }
+    if( i_mode == i_directional )
+    {
+        return i_directional;
+    }
+
+    fprintf( stderr, "invalid 16x16 prediction\n" );
+    return i_dc_fallback;
+}
+
+/* Chroma counterpart of the 16x16 check: return a chroma prediction mode that
+ * only relies on the neighbours flagged in mb->i_neighbour.
+ *  - left and top present: i_mode is returned untouched
+ *  - left only: DC -> DC_LEFT, H is kept
+ *  - top only:  DC -> DC_TOP,  V is kept
+ *  - none:      DC_128
+ * Any other mode with a single neighbour is reported on stderr and replaced
+ * by the matching one-sided DC mode. */
+static int x264_mb_pred_mode8x8_valid( x264_macroblock_t *mb, int i_mode )
+{
+    int i_dc_fallback;  /* DC variant usable with the neighbour we have */
+    int i_directional;  /* the only directional mode still legal */
+
+    if( ( mb->i_neighbour & (MB_LEFT|MB_TOP) ) == (MB_LEFT|MB_TOP) )
+    {
+        return i_mode;
+    }
+
+    if( ( mb->i_neighbour & MB_LEFT ) )
+    {
+        i_dc_fallback = I_PRED_CHROMA_DC_LEFT;
+        i_directional = I_PRED_CHROMA_H;
+    }
+    else if( ( mb->i_neighbour & MB_TOP ) )
+    {
+        i_dc_fallback = I_PRED_CHROMA_DC_TOP;
+        i_directional = I_PRED_CHROMA_V;
+    }
+    else
+    {
+        return I_PRED_CHROMA_DC_128;
+    }
+
+    if( i_mode == I_PRED_CHROMA_DC )
+    {
+        return i_dc_fallback;
+    }
+    if( i_mode == i_directional )
+    {
+        return i_directional;
+    }
+
+    fprintf( stderr, "invalid 8x8 prediction\n" );
+    return i_dc_fallback;
+}
+
+/* Check the intra 4x4 prediction mode of block idx against the neighbouring
+ * macroblocks flagged in mb->i_neighbour and return a mode that is usable.
+ *
+ * needmb[idx] lists which neighbouring macroblocks block idx depends on:
+ * MB_LEFT / MB_TOP for its left / top samples, MB_TOPRIGHT or MB_PRIVATE for
+ * its top-right samples. A block with no entry for a direction gets those
+ * samples from inside the current macroblock.
+ *
+ * *pb_emu is set to 1 when the returned mode (DDL or VL) needs top-right
+ * samples that are not available and must be emulated by the caller from the
+ * top samples; it is 0 otherwise.
+ *
+ * An unusable mode is reported on stderr and replaced by I_PRED_4x4_DC_128. */
+static int x264_mb_pred_mode4x4_valid( x264_macroblock_t *mb, int idx, int i_mode, int *pb_emu )
+{
+    int b_a, b_b, b_c;
+    static const int needmb[16] =
+    {
+        MB_LEFT|MB_TOP, MB_TOP,
+        MB_LEFT,        MB_PRIVATE,
+        MB_TOP,         MB_TOP|MB_TOPRIGHT,
+        0,              MB_PRIVATE,
+        MB_LEFT,        0,
+        MB_LEFT,        MB_PRIVATE,
+        0,              MB_PRIVATE,
+        0,              MB_PRIVATE
+    };
+    int b_emu = 0;
+
+    *pb_emu = 0;
+
+    /* b_a: left usable, b_b: top usable, b_c: top-right usable */
+    b_a = (needmb[idx]&mb->i_neighbour&MB_LEFT) == (needmb[idx]&MB_LEFT);
+    b_b = (needmb[idx]&mb->i_neighbour&MB_TOP) == (needmb[idx]&MB_TOP);
+    b_c = (needmb[idx]&mb->i_neighbour&(MB_TOPRIGHT|MB_PRIVATE)) == (needmb[idx]&(MB_TOPRIGHT|MB_PRIVATE));
+
+    /* top-right missing but top present: it can be emulated from the top row */
+    if( b_c == 0 && b_b )
+    {
+        b_emu = 1;
+        b_c = 1;
+    }
+
+    /* handle I_PRED_4x4_DC */
+    if( i_mode == I_PRED_4x4_DC )
+    {
+        if( b_a && b_b )
+        {
+            return I_PRED_4x4_DC;
+        }
+        else if( b_a )
+        {
+            return I_PRED_4x4_DC_LEFT;
+        }
+        else if( b_b )
+        {
+            return I_PRED_4x4_DC_TOP;
+        }
+        return I_PRED_4x4_DC_128;
+    }
+
+    /* handle 1 dir needed only */
+    if( ( b_a && i_mode == I_PRED_4x4_H ) ||
+        ( b_b && i_mode == I_PRED_4x4_V ) )
+    {
+        return i_mode;
+    }
+
+    /* handle b_c case (b_b always true) */
+    if( b_c && ( i_mode == I_PRED_4x4_DDL || i_mode == I_PRED_4x4_VL ) )
+    {
+        *pb_emu = b_emu;
+        return i_mode;
+    }
+
+    if( b_a && b_b )
+    {
+        /* I_PRED_4x4_DDR, I_PRED_4x4_VR, I_PRED_4x4_HD, I_PRED_4x4_HU */
+        return i_mode;
+    }
+
+    fprintf( stderr, "invalid 4x4 predict mode(%d, mb:%x-%x idx:%d\n", i_mode, mb->i_mb_x, mb->i_mb_y, idx );
+    /* The caller indexes h->predict_4x4[] with the result, so the fallback has
+     * to be a 4x4 mode and not a chroma one. DC_128 needs no neighbour at all
+     * (inefficient, but always valid). */
+    return I_PRED_4x4_DC_128;
+}
+
+/****************************************************************************
+ * UnScan functions
+ ****************************************************************************/
+/* Column (x) and row (y) of the i-th coefficient in 4x4 zigzag scan order */
+static const int scan_zigzag_x[16]={0, 1, 0, 0, 1, 2, 3, 2, 1, 0, 1, 2, 3, 3, 2, 3};
+static const int scan_zigzag_y[16]={0, 0, 1, 2, 1, 0, 0, 1, 2, 3, 3, 2, 1, 2, 3, 3};
+
+/* Spread 16 zigzag-ordered levels over a full 4x4 block, DC included */
+static inline void unscan_zigzag_4x4full( int16_t dct[4][4], int level[16] )
+{
+    int i_scan;
+
+    for( i_scan = 0; i_scan < 16; i_scan++ )
+    {
+        const int i_row = scan_zigzag_y[i_scan];
+        const int i_col = scan_zigzag_x[i_scan];
+
+        dct[i_row][i_col] = level[i_scan];
+    }
+}
+/* Spread 15 zigzag-ordered AC levels over a 4x4 block. level[0] goes to the
+ * second scan position; the DC slot dct[0][0] is not written. */
+static inline void unscan_zigzag_4x4( int16_t dct[4][4], int level[15] )
+{
+    int i_ac;
+
+    for( i_ac = 0; i_ac < 15; i_ac++ )
+    {
+        const int i_scan = i_ac + 1;    /* skip the DC position */
+
+        dct[scan_zigzag_y[i_scan]][scan_zigzag_x[i_scan]] = level[i_ac];
+    }
+}
+
+/* Copy the 4 chroma DC levels into a 2x2 block in raster order */
+static inline void unscan_zigzag_2x2_dc( int16_t dct[2][2], int level[4] )
+{
+    int i_dc;
+
+    for( i_dc = 0; i_dc < 4; i_dc++ )
+    {
+        dct[i_dc >> 1][i_dc & 1] = level[i_dc];
+    }
+}
+
+
+/* Reconstruct one parsed macroblock into the planes pointed to by
+ * mb->context (p_fdec[0] luma, p_fdec[1] and p_fdec[2] chroma, with i_fdec[]
+ * used as the matching line strides).
+ *
+ *  - inter macroblocks are first motion compensated by x264_mb_mc()
+ *  - I_16x16: one 16x16 prediction, then DC + AC residual for all 16 blocks
+ *  - I_4x4: per block prediction followed by its residual (if any)
+ *  - other types: residual added to blocks with a non zero count
+ *  - chroma: intra prediction for intra macroblocks, then DC + AC residual
+ *    when mb->i_cbp_chroma is not 0
+ *
+ * Always returns 0. */
+int x264_macroblock_decode( x264_t *h, x264_macroblock_t *mb )
+{
+    x264_mb_context_t *ctx = mb->context;
+
+    int i_qscale;
+    int ch;
+    int i;
+
+    if( !IS_INTRA(mb->i_type ) )
+    {
+        /* Motion compensation */
+        x264_mb_mc( h, mb );
+    }
+
+    /* luma */
+    i_qscale = mb->i_qp;
+    if( mb->i_type == I_16x16 )
+    {
+        int     i_mode = x264_mb_pred_mode16x16_valid( mb, mb->i_intra16x16_pred_mode );
+        int16_t luma[16][4][4];
+        int16_t dct4x4[16+1][4][4];     /* [0]: the 4x4 DC block, [1+i]: block i */
+
+
+        /* do the right prediction */
+        h->predict_16x16[i_mode]( ctx->p_fdec[0], ctx->i_fdec[0] );
+
+        /* get dc coeffs */
+        unscan_zigzag_4x4full( dct4x4[0], mb->luma16x16_dc );
+        h->dctf.idct4x4dc( dct4x4[0], dct4x4[0] );
+        x264_mb_dequant_4x4_dc( dct4x4[0], i_qscale );
+
+        /* decode the 16x16 macroblock */
+        for( i = 0; i < 16; i++ )
+        {
+            unscan_zigzag_4x4( dct4x4[1+i], mb->block[i].residual_ac );
+            x264_mb_dequant_4x4( dct4x4[1+i], i_qscale );
+
+            /* copy dc coeff */
+            dct4x4[1+i][0][0] = dct4x4[0][block_idx_y[i]][block_idx_x[i]];
+
+            h->dctf.idct4x4( luma[i], dct4x4[i+1] );
+        }
+        /* put pixels to fdec */
+        h->pixf.add16x16( ctx->p_fdec[0], ctx->i_fdec[0], luma );
+    }
+    else if( mb->i_type == I_4x4 )
+    {
+        for( i = 0; i < 16; i++ )
+        {
+            int16_t luma[4][4];
+            int16_t dct4x4[4][4];
+
+            uint8_t *p_dst_by;
+            int     i_mode;
+            int     b_emu;
+
+            /* Do the right prediction */
+            p_dst_by = ctx->p_fdec[0] + 4 * block_idx_x[i] + 4 * block_idx_y[i] * ctx->i_fdec[0];
+            i_mode   = x264_mb_pred_mode4x4_valid( mb, i, mb->block[i].i_intra4x4_pred_mode, &b_emu );
+            if( b_emu )
+            {
+                /* Top-right samples are missing: replicate the last top
+                 * sample p[3,-1] into p[4..7,-1]. The samples live in the
+                 * line above the block, not in its first line.
+                 * NOTE(review): assumes the predict_4x4[] functions read the
+                 * top-right samples from that line, as H.264 defines. */
+                uint8_t *p_top = p_dst_by - ctx->i_fdec[0];
+
+                fprintf( stderr, "mmmh b_emu\n" );
+                memset( &p_top[4], p_top[3], 4 );
+            }
+            h->predict_4x4[i_mode]( p_dst_by, ctx->i_fdec[0] );
+
+            if( mb->block[i].i_non_zero_count > 0 )
+            {
+                /* decode one 4x4 block */
+                unscan_zigzag_4x4full( dct4x4, mb->block[i].luma4x4 );
+
+                x264_mb_dequant_4x4( dct4x4, i_qscale );
+
+                h->dctf.idct4x4( luma, dct4x4 );
+
+                h->pixf.add4x4( p_dst_by, ctx->i_fdec[0], luma );
+            }
+        }
+    }
+    else /* Inter mb */
+    {
+        for( i = 0; i < 16; i++ )
+        {
+            uint8_t *p_dst_by;
+            int16_t luma[4][4];
+            int16_t dct4x4[4][4];
+
+            /* blocks without coefficients keep the motion compensated pixels */
+            if( mb->block[i].i_non_zero_count > 0 )
+            {
+                unscan_zigzag_4x4full( dct4x4, mb->block[i].luma4x4 );
+                x264_mb_dequant_4x4( dct4x4, i_qscale );
+
+                h->dctf.idct4x4( luma, dct4x4 );
+
+                p_dst_by = ctx->p_fdec[0] + 4 * block_idx_x[i] + 4 * block_idx_y[i] * ctx->i_fdec[0];
+                h->pixf.add4x4( p_dst_by, ctx->i_fdec[0], luma );
+            }
+        }
+    }
+
+    /* chroma: the qp is derived from the luma one through i_chroma_qp_table */
+    i_qscale = i_chroma_qp_table[x264_clip3( i_qscale + h->pps->i_chroma_qp_index_offset, 0, 51 )];
+    if( IS_INTRA( mb->i_type ) )
+    {
+        int i_mode = x264_mb_pred_mode8x8_valid( mb, mb->i_chroma_pred_mode );
+        /* do the right prediction */
+        h->predict_8x8[i_mode]( ctx->p_fdec[1], ctx->i_fdec[1] );
+        h->predict_8x8[i_mode]( ctx->p_fdec[2], ctx->i_fdec[2] );
+    }
+
+    if( mb->i_cbp_chroma != 0 )
+    {
+        for( ch = 0; ch < 2; ch++ )
+        {
+            int16_t chroma[4][4][4];
+            int16_t dct2x2[2][2];
+            int16_t dct4x4[4][4][4];
+
+            /* get dc chroma */
+            unscan_zigzag_2x2_dc( dct2x2, mb->chroma_dc[ch] );
+            h->dctf.idct2x2dc( dct2x2, dct2x2 );
+            x264_mb_dequant_2x2_dc( dct2x2, i_qscale );
+
+            for( i = 0; i < 4; i++ )
+            {
+                /* chroma blocks are stored after the 16 luma ones, 4 per plane */
+                unscan_zigzag_4x4( dct4x4[i], mb->block[16+i+ch*4].residual_ac );
+                x264_mb_dequant_4x4( dct4x4[i], i_qscale );
+
+                /* copy dc coeff */
+                dct4x4[i][0][0] = dct2x2[block_idx_y[i]][block_idx_x[i]];
+
+                h->dctf.idct4x4( chroma[i], dct4x4[i] );
+            }
+            h->pixf.add8x8( ctx->p_fdec[1+ch], ctx->i_fdec[1+ch], chroma );
+        }
+    }
+
+    return 0;
+}
+
+/* Decode a skipped macroblock: it is handled as a P_L0 16x16 macroblock on
+ * reference 0 with no residual, using the motion vector given by
+ * x264_mb_predict_mv_pskip(). mb->i_type is P_SKIP on return. */
+void x264_macroblock_decode_skip( x264_t *h, x264_macroblock_t *mb )
+{
+    int i_blk;
+    int i_part;
+    int mv[2];
+
+    /* decode it as a 16x16 with no luma/chroma */
+    mb->i_type       = P_L0;
+    mb->i_partition  = D_16x16;
+    mb->i_cbp_luma   = 0;
+    mb->i_cbp_chroma = 0;
+
+    /* luma blocks */
+    for( i_blk = 0; i_blk < 16; i_blk++ )
+    {
+        mb->block[i_blk].i_non_zero_count = 0;
+        array_zero_set( mb->block[i_blk].luma4x4, 16 );
+    }
+
+    /* chroma dc then the 8 chroma ac blocks */
+    array_zero_set( mb->chroma_dc[0], 4 );
+    array_zero_set( mb->chroma_dc[1], 4 );
+    for( i_blk = 16; i_blk < 16 + 8; i_blk++ )
+    {
+        mb->block[i_blk].i_non_zero_count = 0;
+        array_zero_set( mb->block[i_blk].residual_ac, 15 );
+    }
+
+    /* set ref0 on the 4x4 partition grid */
+    for( i_part = 0; i_part < 16; i_part++ )
+    {
+        mb->partition[i_part >> 2][i_part & 3].i_ref[0] = 0;
+    }
+
+    /* get mv */
+    x264_mb_predict_mv_pskip( mb, mv );
+    x264_mb_partition_mv_set( mb, 0, 0, 0, mv );
+
+    /* Motion compensation */
+    x264_mb_mc( h, mb );
+
+    mb->i_type = P_SKIP;
+}
+
diff --git a/decoder/macroblock.h b/decoder/macroblock.h
new file mode 100644
index 00000000..96b5d2eb
--- /dev/null
+++ b/decoder/macroblock.h
@@ -0,0 +1,34 @@
+/*****************************************************************************
+ * macroblock.h: h264 decoder library
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: macroblock.h,v 1.1 2004/06/03 19:27:07 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#ifndef _DECODER_MACROBLOCK_H
+#define _DECODER_MACROBLOCK_H 1
+
+/* Macroblock parsers for the two entropy coders.
+ * NOTE(review): presumably they fill mb from the bitstream s and return 0 on
+ * success and a negative value on error - confirm against macroblock.c. */
+int  x264_macroblock_read_cabac( x264_t *h, bs_t *s, x264_macroblock_t *mb );
+int  x264_macroblock_read_cavlc( x264_t *h, bs_t *s, x264_macroblock_t *mb );
+
+/* Reconstruct a parsed macroblock (prediction or motion compensation, then
+ * residual). Always returns 0. */
+int  x264_macroblock_decode( x264_t *h, x264_macroblock_t *mb );
+/* Reconstruct a skipped macroblock: no residual, predicted motion vector on
+ * reference 0. mb->i_type is P_SKIP on return. */
+void x264_macroblock_decode_skip( x264_t *h, x264_macroblock_t *mb );
+
+#endif
+
diff --git a/decoder/set.c b/decoder/set.c
new file mode 100644
index 00000000..fb489970
--- /dev/null
+++ b/decoder/set.c
@@ -0,0 +1,262 @@
+/*****************************************************************************
+ * x264: h264 decoder
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: set.c,v 1.1 2004/06/03 19:27:07 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdint.h>
+
+#include "../core/common.h"
+#include "set.h"
+
+/* Parse a sequence parameter set from s and store it in sps_array[id].
+ *
+ * Returns the id of the parsed sps, or -1 if it is invalid. When the id itself
+ * cannot be trusted nothing is written to sps_array; on a later error the
+ * entry is invalidated (i_id set to -1).
+ *
+ * The VUI parameters are not parsed. */
+int x264_sps_read( bs_t *s, x264_sps_t sps_array[32] )
+{
+    x264_sps_t *sps;
+
+    int i_profile_idc;
+    int i_level_idc;
+
+    int b_constraint_set0;
+    int b_constraint_set1;
+    int b_constraint_set2;
+
+    int id;
+
+    i_profile_idc     = bs_read( s, 8 );
+    b_constraint_set0 = bs_read( s, 1 );
+    b_constraint_set1 = bs_read( s, 1 );
+    b_constraint_set2 = bs_read( s, 1 );
+
+    bs_skip( s, 5 );    /* reserved */
+    i_level_idc       = bs_read( s, 8 );
+
+
+    id = bs_read_ue( s );
+    if( bs_eof( s ) || id < 0 || id >= 32 )
+    {
+        /* the sps is invalid, no need to corrupt sps_array[0] */
+        return -1;
+    }
+
+    sps = &sps_array[id];
+    sps->i_id = id;
+
+    /* put back the already parsed values */
+    sps->i_profile_idc     = i_profile_idc;
+    sps->i_level_idc       = i_level_idc;
+    sps->b_constraint_set0 = b_constraint_set0;
+    sps->b_constraint_set1 = b_constraint_set1;
+    sps->b_constraint_set2 = b_constraint_set2;
+
+    sps->i_log2_max_frame_num = bs_read_ue( s ) + 4;
+
+    sps->i_poc_type = bs_read_ue( s );
+    if( sps->i_poc_type == 0 )
+    {
+        sps->i_log2_max_poc_lsb = bs_read_ue( s ) + 4;
+    }
+    else if( sps->i_poc_type == 1 )
+    {
+        int i;
+        sps->b_delta_pic_order_always_zero = bs_read( s, 1 );
+        sps->i_offset_for_non_ref_pic = bs_read_se( s );
+        sps->i_offset_for_top_to_bottom_field = bs_read_se( s );
+        sps->i_num_ref_frames_in_poc_cycle = bs_read_ue( s );
+        if( sps->i_num_ref_frames_in_poc_cycle < 0 ||
+            sps->i_num_ref_frames_in_poc_cycle > 255 )
+        {
+            /* H.264 limits this to 0..255. Clamping would leave the extra
+             * offsets unread and every following field misaligned, so the
+             * whole sps is rejected. */
+            goto error;
+        }
+        for( i = 0; i < sps->i_num_ref_frames_in_poc_cycle; i++ )
+        {
+            sps->i_offset_for_ref_frame[i] = bs_read_se( s );
+        }
+    }
+    else if( sps->i_poc_type > 2 )
+    {
+        goto error;
+    }
+
+    sps->i_num_ref_frames = bs_read_ue( s );
+    sps->b_gaps_in_frame_num_value_allowed = bs_read( s, 1 );
+    sps->i_mb_width = bs_read_ue( s ) + 1;
+    sps->i_mb_height= bs_read_ue( s ) + 1;
+    sps->b_frame_mbs_only = bs_read( s, 1 );
+    if( !sps->b_frame_mbs_only )
+    {
+        sps->b_mb_adaptive_frame_field = bs_read( s, 1 );
+    }
+    else
+    {
+        sps->b_mb_adaptive_frame_field = 0;
+    }
+    sps->b_direct8x8_inference = bs_read( s, 1 );
+
+    sps->b_crop = bs_read( s, 1 );
+    if( sps->b_crop )
+    {
+        sps->crop.i_left  = bs_read_ue( s );
+        sps->crop.i_right = bs_read_ue( s );
+        sps->crop.i_top   = bs_read_ue( s );
+        sps->crop.i_bottom= bs_read_ue( s );
+    }
+    else
+    {
+        sps->crop.i_left  = 0;
+        sps->crop.i_right = 0;
+        sps->crop.i_top   = 0;
+        sps->crop.i_bottom= 0;
+    }
+
+    sps->b_vui = bs_read( s, 1 );
+    if( sps->b_vui )
+    {
+        /* FIXME the vui parameters are not parsed */
+    }
+
+    if( bs_eof( s ) )
+    {
+        /* no rbsp trailing */
+        fprintf( stderr, "incomplete SPS\n" );
+        goto error;
+    }
+
+    /* the size is printed in macroblocks, in decimal */
+    fprintf( stderr, "x264_sps_read: sps:0x%x profile:%d/%d poc:%d ref:%d %dx%d crop:%d-%d-%d-%d\n",
+             sps->i_id,
+             sps->i_profile_idc, sps->i_level_idc,
+             sps->i_poc_type,
+             sps->i_num_ref_frames,
+             sps->i_mb_width, sps->i_mb_height,
+             sps->crop.i_left, sps->crop.i_right,
+             sps->crop.i_top, sps->crop.i_bottom );
+
+    return id;
+
+error:
+    /* invalidate this sps */
+    sps->i_id = -1;
+    return -1;
+}
+
+/* Parse a picture parameter set from s and store it in pps_array[id].
+ *
+ * Returns the id of the parsed pps, or -1 if it is invalid. When the id itself
+ * cannot be trusted nothing is written to pps_array; on a later error the
+ * entry is invalidated (i_id set to -1).
+ *
+ * Slice groups (FMO) are parsed only to stay aligned with the bitstream:
+ * decoding them is not supported. */
+int x264_pps_read( bs_t *s, x264_pps_t pps_array[256] )
+{
+    x264_pps_t *pps;
+    int id;
+    int i;
+
+    id = bs_read_ue( s );
+    if( bs_eof( s ) || id < 0 || id >= 256 )
+    {
+        fprintf( stderr, "id invalid\n" );
+        return -1;
+    }
+    pps = &pps_array[id];
+    pps->i_id = id;
+    pps->i_sps_id = bs_read_ue( s );
+    if( pps->i_sps_id < 0 || pps->i_sps_id >= 32 )
+    {
+        goto error;
+    }
+    pps->b_cabac = bs_read( s, 1 );
+    pps->b_pic_order = bs_read( s, 1 );
+    pps->i_num_slice_groups = bs_read_ue( s ) + 1;
+    if( pps->i_num_slice_groups < 1 || pps->i_num_slice_groups > 8 )
+    {
+        /* H.264 allows at most 8 slice groups; the count also bounds the
+         * writes to the per group arrays below */
+        goto error;
+    }
+    if( pps->i_num_slice_groups > 1 )
+    {
+        fprintf( stderr, "FMO unsupported\n " );
+
+        pps->i_slice_group_map_type  =bs_read_ue( s );
+        if( pps->i_slice_group_map_type == 0 )
+        {
+            for( i = 0; i < pps->i_num_slice_groups; i++ )
+            {
+                pps->i_run_length[i] = bs_read_ue( s );
+            }
+        }
+        else if( pps->i_slice_group_map_type == 2 )
+        {
+            /* one rectangle per group except the last one, which takes
+             * whatever is left */
+            for( i = 0; i < pps->i_num_slice_groups - 1; i++ )
+            {
+                pps->i_top_left[i] = bs_read_ue( s );
+                pps->i_bottom_right[i] = bs_read_ue( s );
+            }
+        }
+        else if( pps->i_slice_group_map_type == 3 ||
+                 pps->i_slice_group_map_type == 4 ||
+                 pps->i_slice_group_map_type == 5 )
+        {
+            pps->b_slice_group_change_direction = bs_read( s, 1 );
+            pps->i_slice_group_change_rate = bs_read_ue( s ) + 1;
+        }
+        else if( pps->i_slice_group_map_type == 6 )
+        {
+            int i_bits = 0;
+
+            /* each slice_group_id uses Ceil( Log2( i_num_slice_groups ) ) bits */
+            while( ( 1 << i_bits ) < pps->i_num_slice_groups )
+            {
+                i_bits++;
+            }
+
+            pps->i_pic_size_in_map_units = bs_read_ue( s ) + 1;
+            for( i = 0; i < pps->i_pic_size_in_map_units && !bs_eof( s ); i++ )
+            {
+                /* FIXME the ids are not stored, they are only skipped so that
+                 * the fields that follow are read from the right position */
+                bs_skip( s, i_bits );
+            }
+        }
+    }
+    pps->i_num_ref_idx_l0_active = bs_read_ue( s ) + 1;
+    pps->i_num_ref_idx_l1_active = bs_read_ue( s ) + 1;
+    pps->b_weighted_pred = bs_read( s, 1 );
+    pps->b_weighted_bipred = bs_read( s, 2 );   /* 2 bits: it is an idc, not a flag */
+
+    pps->i_pic_init_qp = bs_read_se( s ) + 26;
+    pps->i_pic_init_qs = bs_read_se( s ) + 26;
+
+    pps->i_chroma_qp_index_offset = bs_read_se( s );
+
+    pps->b_deblocking_filter_control = bs_read( s, 1 );
+    pps->b_constrained_intra_pred = bs_read( s, 1 );
+    pps->b_redundant_pic_cnt = bs_read( s, 1 );
+
+    if( bs_eof( s ) )
+    {
+        /* no rbsp trailing */
+        fprintf( stderr, "incomplete PPS\n" );
+        goto error;
+    }
+    fprintf( stderr, "x264_pps_read: pps:0x%x sps:0x%x %s slice_groups=%d ref0:%d ref1:%d QP:%d QS:%d QC=%d DFC:%d CIP:%d RPC:%d\n",
+             pps->i_id, pps->i_sps_id,
+             pps->b_cabac ? "CABAC" : "CAVLC",
+             pps->i_num_slice_groups,
+             pps->i_num_ref_idx_l0_active,
+             pps->i_num_ref_idx_l1_active,
+             pps->i_pic_init_qp, pps->i_pic_init_qs, pps->i_chroma_qp_index_offset,
+             pps->b_deblocking_filter_control,
+             pps->b_constrained_intra_pred,
+             pps->b_redundant_pic_cnt );
+
+    return id;
+error:
+    /* invalidate this pps */
+    pps->i_id = -1;
+    return -1;
+}
+
diff --git a/decoder/set.h b/decoder/set.h
new file mode 100644
index 00000000..62719457
--- /dev/null
+++ b/decoder/set.h
@@ -0,0 +1,33 @@
+/*****************************************************************************
+ * set.h: h264 decoder
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: set.h,v 1.1 2004/06/03 19:27:07 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#ifndef _DECODER_SET_H
+#define _DECODER_SET_H 1
+
+/* Parse a sequence parameter set from s into sps_array[id].
+ * return -1 if invalid, else the id (always below 32) */
+int x264_sps_read( bs_t *s, x264_sps_t sps_array[32] );
+
+/* Parse a picture parameter set from s into pps_array[id].
+ * return -1 if invalid, else the id (always below 256) */
+int x264_pps_read( bs_t *s, x264_pps_t pps_array[256] );
+
+#endif
diff --git a/decoder/vlc.c b/decoder/vlc.c
new file mode 100644
index 00000000..2109e8c6
--- /dev/null
+++ b/decoder/vlc.c
@@ -0,0 +1,236 @@
+/*****************************************************************************
+ * vlc.c: VLC lookup table generation.
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: vlc.c,v 1.1 2004/06/03 19:27:07 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <inttypes.h>
+
+#include "../core/common.h"
+#include "../core/vlc.h"
+#include "vlc.h"
+
+
+/* Grow table->lookup by i_size entries.
+ * Returns the index of the first new entry. The array may move, so pointers
+ * into the old one must not be used afterwards. */
+static int  vlc_table_realloc( x264_vlc_table_t *table, int i_size )
+{
+    const int i_first = table->i_lookup;
+    const int i_total = i_first + i_size;
+
+    table->lookup   = x264_realloc( table->lookup, sizeof( vlc_lookup_t ) * i_total );
+    table->i_lookup = i_total;
+
+    return( i_first );
+}
+
+/* Append to table one lookup level of 1 << i_lookup_bits entries for all the
+ * codes of vlc[] (i_nb_vlc entries) that start with the prefix
+ * i_prefix_code, which is i_prefix_length bits long.
+ *
+ * Each entry of the level ends up in one of three states:
+ *  - i_value == -1, i_size == 0: no code matches
+ *  - i_size > 0: a code ends here, i_value is its index in vlc[] and i_size
+ *    the number of bits it uses at this level
+ *  - i_size < 0: longer codes share these bits, i_value is the index of a
+ *    sub level indexed with -i_size more bits
+ *
+ * Entries of vlc[] with i_size <= 0 are ignored.
+ * Returns the index of the first entry of the level in table->lookup. */
+static int vlc_table_create_part( x264_vlc_table_t *table, const vlc_t *vlc, int i_lookup_bits, int i_nb_vlc, int i_prefix_code, int i_prefix_length )
+{
+    int i;
+    int i_nb_lookup;
+    vlc_lookup_t *lookup;
+    int i_table_index;
+
+    i_nb_lookup = 1 << i_lookup_bits;
+
+    i_table_index = vlc_table_realloc( table, i_nb_lookup );
+    lookup = &table->lookup[i_table_index];
+
+    for( i = 0; i < i_nb_lookup; i++ )
+    {
+        lookup[i].i_value  = -1;
+        lookup[i].i_size = 0;
+    }
+
+    for( i = 0; i < i_nb_vlc; i++ )
+    {
+        int i_bits;
+        if( vlc[i].i_size <= 0 )
+        {
+            continue;
+        }
+
+        /* bits of the code left once the prefix is removed */
+        i_bits = vlc[i].i_size - i_prefix_length;
+        if( i_bits > 0 && ( vlc[i].i_bits >> i_bits ) == i_prefix_code )
+        {
+            if( i_bits <= i_lookup_bits )
+            {
+                /* the code ends in this level: it owns every entry that
+                 * starts with its bits */
+                const int i_fill = 1 << ( i_lookup_bits - i_bits );
+                int i_lookup_index;
+                int nb;
+
+                i_lookup_index = ( vlc[i].i_bits << ( i_lookup_bits - i_bits ) )%i_nb_lookup;
+                for( nb = 0; nb < i_fill; nb++ )
+                {
+                    lookup[i_lookup_index].i_value = i; /* vlc[i].i_value; */
+                    lookup[i_lookup_index].i_size = i_bits;
+                    i_lookup_index++;
+                }
+            }
+            else
+            {
+                int i_bits_max;
+                int i_lookup_index;
+                /* need another table */
+                i_lookup_index = ( vlc[i].i_bits >> (i_bits - i_lookup_bits ) )%i_nb_lookup;
+
+                /* remember, as a negative size, the longest remainder seen */
+                i_bits_max =  -lookup[i_lookup_index].i_size;
+                if( i_bits_max < i_bits - i_lookup_bits )
+                {
+                    i_bits_max = i_bits - i_lookup_bits;
+                }
+                lookup[i_lookup_index].i_size = -i_bits_max;
+            }
+        }
+    }
+
+    /* create other level table */
+    for( i = 0; i < i_nb_lookup; i++ )
+    {
+        if( lookup[i].i_size < 0 )
+        {
+            int i_bits;
+            int i_index;
+            i_bits = -lookup[i].i_size;
+            if( i_bits > i_lookup_bits )
+            {
+                /* a sub level never uses more bits than this one */
+                lookup[i].i_size = -i_lookup_bits;
+                i_bits = i_lookup_bits;
+            }
+
+            i_index = vlc_table_create_part( table, vlc, i_bits, i_nb_vlc,
+                                             (i_prefix_code << i_lookup_bits)|i,
+                                              i_lookup_bits+i_prefix_length );
+            lookup = &table->lookup[i_table_index]; // reallocated
+            lookup[i].i_value = i_index;
+        }
+    }
+
+    return( i_table_index );
+}
+
+
+/* Build a multi level lookup table for the i_vlc codes of vlc[], the first
+ * level being indexed with i_lookup_bits bits.
+ * Returns the table, to be released with x264_vlc_table_lookup_delete(), or
+ * NULL if it cannot be allocated. */
+x264_vlc_table_t *x264_vlc_table_lookup_new( const vlc_t *vlc, int i_vlc, int i_lookup_bits )
+{
+    x264_vlc_table_t *table = x264_malloc( sizeof( x264_vlc_table_t ) );
+
+    if( table == NULL )
+    {
+        return NULL;
+    }
+
+    table->i_lookup_bits = i_lookup_bits;
+    table->i_lookup = 0;
+    table->lookup   = NULL;
+
+    vlc_table_create_part( table, vlc, i_lookup_bits, i_vlc, 0, 0 );
+
+    return table;
+}
+
+/* Release a table created by x264_vlc_table_lookup_new().
+ * A NULL table is accepted and ignored. */
+void x264_vlc_table_lookup_delete( x264_vlc_table_t *table )
+{
+    if( table == NULL )
+    {
+        return;
+    }
+
+    x264_free( table->lookup );
+    x264_free( table );
+}
+
+/* Disabled table generator: prints the lookup tables as C initializers.
+ * Nothing below is compiled. */
+#if 0
+/* Print every entry of table on stderr as a MKVLCLU() initializer, 4 per line */
+void x264_vlc_table_lookup_print( x264_vlc_table_t *table )
+{
+    int idx;
+
+    fprintf( stderr, "       " );
+    for( idx = 0; idx < table->i_lookup; idx++ )
+    {
+        if( table->lookup[idx].i_value == -1 )
+        {
+            fprintf( stderr, " MKVLCLU(    -1,  0 )," );
+        }
+        else
+        {
+            fprintf( stderr, " MKVLCLU( 0x%.3x, % 2d ),", table->lookup[idx].i_value, table->lookup[idx].i_size );
+        }
+        if( (idx+1)%4 == 0 && idx < table->i_lookup - 1)
+        {
+            fprintf( stderr, "\n       " );
+        }
+    }
+    fprintf( stderr, "\n" );
+}
+
+/* Generate the x264_coeff_token lookup tables (6 bits for the first level) */
+int main(void)
+{
+    int i;
+    x264_vlc_table_t *table;
+
+
+    /* NOTE(review): the typedef text has no opening brace and goes to stdout
+     * while the tables go to stderr - to be checked before enabling this */
+    printf( "typedef struct\n    int i_value;\n    int i_size;\n} vlc_lookup_t;\n\n#define MKVLCLU(a,b) { .i_value=a, .i_size=b}" );
+
+    /* create vlc  entry table and then vlc_lookup_t table */
+
+    /* x264_coeff_token */
+    fprintf( stderr, "static const vlc_lookup_t x264_coeff_token_lookup[5][]=\n{\n" );
+    for( i = 0; i < 5; i++ )
+    {
+        fprintf( stderr, "    {\n" );
+        table = x264_vlc_table_lookup_new( x264_coeff_token[i], 17*4, 6 );
+        x264_vlc_table_lookup_print( table );
+        x264_vlc_table_lookup_delete( table );
+        fprintf( stderr, "    },\n" );
+    }
+    fprintf( stderr, "};\n" );
+
+    /* NOTE(review): the part below calls convert_vlc_to_vlce() and
+     * do_vlc_table_create(), which are not defined in this file */
+#if 0
+
+    vlce = convert_vlc_to_vlce( x264_level_prefix, 16 );
+    do_vlc_table_create( vlce, 16, "x264_level_prefix_lookup", 8 );
+    free( vlce );
+
+    for( i_table = 0; i_table < 15; i_table++ )
+    {
+        char name[512];
+        vlce = convert_vlc_to_vlce( x264_total_zeros[i_table], 16 );
+        sprintf( name, "x264_total_zeros_%d", i_table );
+        do_vlc_table_create( vlce, 16, name, 6 );
+
+        free( vlce );
+    }
+
+    for( i_table = 0; i_table < 3; i_table++ )
+    {
+        char name[512];
+
+        vlce = convert_vlc_to_vlce( x264_total_zeros_dc[i_table], 4 );
+        sprintf( name, "x264_total_zeros_dc_%d", i_table );
+        do_vlc_table_create( vlce, 4, name, 3 );
+
+        free( vlce );
+    }
+
+    for( i_table = 0; i_table < 7; i_table++ )
+    {
+        char name[512];
+        vlce = convert_vlc_to_vlce( x264_run_before[i_table], 15 );
+        sprintf( name, "x264_run_before_%d", i_table );
+        do_vlc_table_create( vlce, 15, name, 6 );
+
+        free( vlce );
+    }
+#endif
+    return 0;
+}
+
+#endif
diff --git a/decoder/vlc.h b/decoder/vlc.h
new file mode 100644
index 00000000..9529349e
--- /dev/null
+++ b/decoder/vlc.h
@@ -0,0 +1,46 @@
+/*****************************************************************************
+ * vlc.h: h264 decoder
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: vlc.h,v 1.1 2004/06/03 19:27:07 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#ifndef _DECODER_VLC_H
+#define _DECODER_VLC_H 1
+
+/* One entry of a lookup level */
+typedef struct
+{
+    /* index of the matching code in the vlc array, -1 if no code matches, or
+     * the index of the first entry of a sub level when i_size is negative */
+    int i_value;
+    /* > 0: bits used by the code at this level, 0: unused entry,
+     * < 0: minus the number of bits indexing the sub level */
+    int i_size;
+} vlc_lookup_t;
+
+struct x264_vlc_table_t
+{
+    /* number of bits indexing the first level */
+    int          i_lookup_bits;
+
+    /* number of entries in lookup, all levels included */
+    int          i_lookup;
+    vlc_lookup_t *lookup;
+};
+
+/* Build the lookup table of the i_vlc codes of vlc */
+x264_vlc_table_t *x264_vlc_table_lookup_new( const vlc_t *vlc, int i_vlc, int i_lookup_bits );
+
+/* Free a table and its entries */
+void x264_vlc_table_lookup_delete( x264_vlc_table_t *table );
+
+#endif
+
diff --git a/doc/dct.txt b/doc/dct.txt
new file mode 100644
index 00000000..eb0e64f0
--- /dev/null
+++ b/doc/dct.txt
@@ -0,0 +1,111 @@
+/****************************************************************************
+ * DCT/IDCT functions
+ ****************************************************************************/
+/* Be careful: "dct" may be the same array as the input block (i.e. dct_4x4( dct, dct )) */
+static void dct_2x2_dc( int16_t dct[2][2], int16_t chroma[2][2] )
+{
+    int tmp[2][2];
+
+    tmp[0][0] = chroma[0][0] + chroma[0][1];
+    tmp[1][0] = chroma[0][0] - chroma[0][1];
+    tmp[0][1] = chroma[1][0] + chroma[1][1];
+    tmp[1][1] = chroma[1][0] - chroma[1][1];
+
+    dct[0][0] = tmp[0][0] + tmp[0][1];
+    dct[0][1] = tmp[1][0] + tmp[1][1];
+    dct[1][0] = tmp[0][0] - tmp[0][1];
+    dct[1][1] = tmp[1][0] - tmp[1][1];
+}
+
+static void idct_2x2_dc( int16_t dct[2][2], int16_t chroma[2][2] )
+{
+    dct_2x2_dc( chroma, dct ); /* same butterfly as the forward 2x2: reads dct, writes chroma */
+}
+
+static void dct_4x4_dc( int16_t dct[4][4], int16_t luma[4][4] )
+{
+    int16_t tmp[4][4];
+    int i;
+
+    for( i = 0; i < 4; i++ )
+    {
+        tmp[0][i] = luma[i][0] + luma[i][1] + luma[i][2] + luma[i][3];
+        tmp[1][i] = luma[i][0] + luma[i][1] - luma[i][2] - luma[i][3];
+        tmp[2][i] = luma[i][0] - luma[i][1] - luma[i][2] + luma[i][3];
+        tmp[3][i] = luma[i][0] - luma[i][1] + luma[i][2] - luma[i][3];
+    }
+
+    for( i = 0; i < 4; i++ )
+    {
+        dct[0][i] = ( tmp[i][0] + tmp[i][1] + tmp[i][2] + tmp[i][3] + 1) / 2;
+        dct[1][i] = ( tmp[i][0] + tmp[i][1] - tmp[i][2] - tmp[i][3] + 1) / 2;
+        dct[2][i] = ( tmp[i][0] - tmp[i][1] - tmp[i][2] + tmp[i][3] + 1) / 2;
+        dct[3][i] = ( tmp[i][0] - tmp[i][1] + tmp[i][2] - tmp[i][3] + 1) / 2;
+    }
+}
+
+static void dct_4x4( int16_t dct[4][4], int16_t luma[4][4] )
+{
+    int16_t tmp[4][4];
+    int i;
+
+    for( i = 0; i < 4; i++ )
+    {
+        tmp[0][i] =   luma[i][0] +   luma[i][1] +   luma[i][2] +   luma[i][3];
+        tmp[1][i] = 2*luma[i][0] +   luma[i][1] -   luma[i][2] - 2*luma[i][3];
+        tmp[2][i] =   luma[i][0] -   luma[i][1] -   luma[i][2] +   luma[i][3];
+        tmp[3][i] =   luma[i][0] - 2*luma[i][1] + 2*luma[i][2] -   luma[i][3];
+    }
+
+    for( i = 0; i < 4; i++ )
+    {
+        dct[0][i] =   tmp[i][0] +   tmp[i][1] +   tmp[i][2] +   tmp[i][3];
+        dct[1][i] = 2*tmp[i][0] +   tmp[i][1] -   tmp[i][2] - 2*tmp[i][3];
+        dct[2][i] =   tmp[i][0] -   tmp[i][1] -   tmp[i][2] +   tmp[i][3];
+        dct[3][i] =   tmp[i][0] - 2*tmp[i][1] + 2*tmp[i][2] -   tmp[i][3];
+    }
+}
+
+static void idct_4x4( int16_t dct[4][4], int16_t luma[4][4] )
+{
+    int16_t tmp[4][4]; /* first-pass result; luma is only written from tmp in the second pass */
+    int i;
+
+    for( i = 0; i < 4; i++ ) /* first 1-D pass over dct[0..3][i] */
+    {
+        tmp[0][i] = dct[0][i]   +  dct[1][i]     + dct[2][i]   + (dct[3][i]>>1);
+        tmp[1][i] = dct[0][i]   + (dct[1][i]>>1) - dct[2][i]   -  dct[3][i];
+        tmp[2][i] = dct[0][i]   - (dct[1][i]>>1) - dct[2][i]   +  dct[3][i];
+        tmp[3][i] = dct[0][i]   -  dct[1][i]     + dct[2][i]   - (dct[3][i]>>1);
+    }
+
+    for( i = 0; i < 4; i++ ) /* second 1-D pass over tmp[i][0..3], with rounding (+32) and scaling */
+    {
+        /* XXX keep ">> 6" here, do not replace it with "/ 64": the two do not give the same result (they differ for negative sums) */
+        luma[i][0] = ( tmp[i][0] +  tmp[i][1]     + tmp[i][2] + (tmp[i][3]>>1) + 32 ) >> 6;
+        luma[i][1] = ( tmp[i][0] + (tmp[i][1]>>1) - tmp[i][2] -  tmp[i][3]     + 32 ) >> 6;
+        luma[i][2] = ( tmp[i][0] - (tmp[i][1]>>1) - tmp[i][2] +  tmp[i][3]     + 32 ) >> 6;
+        luma[i][3] = ( tmp[i][0] -  tmp[i][1]     + tmp[i][2] - (tmp[i][3]>>1) + 32 ) >> 6;
+    }
+}
+static void idct_4x4_dc( int16_t dct[4][4], int16_t luma[4][4] )
+{
+    int16_t tmp[4][4];
+    int i;
+
+    for( i = 0; i < 4; i++ )
+    {
+        tmp[0][i] = dct[0][i] + dct[1][i] + dct[2][i] + dct[3][i];
+        tmp[1][i] = dct[0][i] + dct[1][i] - dct[2][i] - dct[3][i];
+        tmp[2][i] = dct[0][i] - dct[1][i] - dct[2][i] + dct[3][i];
+        tmp[3][i] = dct[0][i] - dct[1][i] + dct[2][i] - dct[3][i];
+    }
+
+    for( i = 0; i < 4; i++ )
+    {
+        luma[i][0] = ( tmp[i][0] + tmp[i][1] + tmp[i][2] + tmp[i][3] ) ;
+        luma[i][1] = ( tmp[i][0] + tmp[i][1] - tmp[i][2] - tmp[i][3] ) ;
+        luma[i][2] = ( tmp[i][0] - tmp[i][1] - tmp[i][2] + tmp[i][3] ) ;
+        luma[i][3] = ( tmp[i][0] - tmp[i][1] + tmp[i][2] - tmp[i][3] ) ;
+    }
+}
diff --git a/encoder/analyse.c b/encoder/analyse.c
new file mode 100644
index 00000000..77e2c316
--- /dev/null
+++ b/encoder/analyse.c
@@ -0,0 +1,1224 @@
+/*****************************************************************************
+ * analyse.c: h264 encoder library
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: analyse.c,v 1.1 2004/06/03 19:27:08 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdint.h>
+#include <math.h>
+
+#include "../core/common.h"
+#include "../core/macroblock.h"
+#include "macroblock.h"
+#include "me.h"
+
+typedef struct /* motion search results of one reference list for the current macroblock */
+{
+    /* 16x16 */
+    int i_ref; /* reference index kept by the 16x16 search (lowest cost) */
+    x264_me_t me16x16;
+
+    /* 8x8 */
+    int       i_cost8x8; /* sum of the four me8x8[].cost */
+    x264_me_t me8x8[4];
+
+    /* Sub 4x4 */
+    int       i_cost4x4[4]; /* cost per 8x8 partition */
+    x264_me_t me4x4[4][4];
+
+    /* Sub 8x4 */
+    int       i_cost8x4[4]; /* cost per 8x8 partition */
+    x264_me_t me8x4[4][2];
+
+    /* Sub 4x8 -- NOTE(review): the 4x8 search only fills me4x8[i][0..1]; confirm [4][2] would be enough */
+    int       i_cost4x8[4]; /* cost per 8x8 partition */
+    x264_me_t me4x8[4][4];
+
+    /* 16x8 */
+    int       i_cost16x8; /* sum of the two me16x8[].cost */
+    x264_me_t me16x8[2];
+
+    /* 8x16 */
+    int       i_cost8x16; /* sum of the two me8x16[].cost */
+    x264_me_t me8x16[2];
+
+} x264_mb_analysis_list_t;
+
+typedef struct /* per-macroblock analysis state; reset by x264_mb_analyse_init */
+{
+    /* conduct the analysis using this lambda and QP */
+    int i_lambda;
+    int i_qp;
+
+
+    /* I: Intra part */
+    /* Luma part 16x16 and 4x4 modes stats (costs are SATD + lambda * mode bits, -1 = not computed) */
+    int i_sad_i16x16;
+    int i_predict16x16;
+
+    int i_sad_i4x4;
+    int i_predict4x4[4][4]; /* indexed [x][y], in 4x4 block units */
+
+    /* Chroma part */
+    int i_sad_i8x8;
+    int i_predict8x8;
+
+    /* II: Inter part P/B frame */
+    int i_mv_range; /* copied into every x264_me_t before a search */
+
+    x264_mb_analysis_list_t l0;
+    x264_mb_analysis_list_t l1;
+
+    int i_cost16x16bi; /* uses the same ref and mv as l0 and l1 (at least for now) */
+
+} x264_mb_analysis_t;
+
+static const int i_qp0_cost_table[52] = { /* lambda used by the analysis, indexed by QP (0..51) */
+   1, 1, 1, 1, 1, 1, 1, 1,  /*  0-7 */
+   1, 1, 1, 1,              /*  8-11 */
+   1, 1, 1, 1, 2, 2, 2, 2,  /* 12-19 */
+   3, 3, 3, 4, 4, 4, 5, 6,  /* 20-27 */
+   6, 7, 8, 9,10,11,13,14,  /* 28-35 */
+  16,18,20,23,25,29,32,36,  /* 36-43 */
+  40,45,51,57,64,72,81,91   /* 44-51 */
+};
+
+static const uint8_t block_idx_x[16] = { /* x position, in 4x4 block units, of luma block idx inside the MB */
+    0, 1, 0, 1, 2, 3, 2, 3, 0, 1, 0, 1, 2, 3, 2, 3
+};
+static const uint8_t block_idx_y[16] = { /* y position, in 4x4 block units, of luma block idx inside the MB */
+    0, 0, 1, 1, 0, 0, 1, 1, 2, 2, 3, 3, 2, 2, 3, 3
+};
+
+static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int i_qp )
+{
+    memset( a, 0, sizeof( x264_mb_analysis_t ) );
+
+    /* conduct the analysis using this lambda and QP (i_qp indexes a 52-entry table, so it must be 0..51) */
+    a->i_qp = i_qp;
+    a->i_lambda = i_qp0_cost_table[i_qp];
+
+    /* I: Intra part (-1 = cost not computed yet) */
+    a->i_sad_i16x16 = -1;
+    a->i_sad_i4x4   = -1;
+    a->i_sad_i8x8   = -1;
+
+    /* II: Inter part P/B frame */
+    if( h->sh.i_type != SLICE_TYPE_I )
+    {
+        int dmb;
+        int i;
+
+        /* Calculate max start MV range: dmb = min( mb_x, mb_y, mb_width - mb_x, mb_height - mb_y ) */
+        dmb = h->mb.i_mb_x;
+        if( h->mb.i_mb_y < dmb )
+            dmb = h->mb.i_mb_y;
+        if( h->sps->i_mb_width - h->mb.i_mb_x < dmb )
+            dmb = h->sps->i_mb_width - h->mb.i_mb_x;
+        if( h->sps->i_mb_height - h->mb.i_mb_y < dmb )
+            dmb = h->sps->i_mb_height - h->mb.i_mb_y;
+
+        a->i_mv_range = 16*dmb + 8;
+
+        a->l0.me16x16.cost = -1;
+        a->l0.i_cost8x8    = -1;
+
+        for( i = 0; i < 4; i++ ) /* one cost per 8x8 partition */
+        {
+            a->l0.i_cost4x4[i] = -1;
+            a->l0.i_cost8x4[i] = -1;
+            a->l0.i_cost4x8[i] = -1;
+        }
+
+        a->l0.i_cost16x8   = -1;
+        a->l0.i_cost8x16   = -1;
+        if( h->sh.i_type == SLICE_TYPE_B ) /* list 1 is only initialised for B slices */
+        {
+            a->l1.me16x16.cost = -1;
+            a->l1.i_cost8x8    = -1;
+
+            for( i = 0; i < 4; i++ )
+            {
+                a->l1.i_cost4x4[i] = -1;
+                a->l1.i_cost8x4[i] = -1;
+                a->l1.i_cost4x8[i] = -1;
+            }
+
+            a->l1.i_cost16x8   = -1;
+            a->l1.i_cost8x16   = -1;
+
+            a->i_cost16x16bi   = -1;
+        }
+    }
+}
+
+
+
+/*
+ * Handle intra mb
+ */
+/* Fill mode[] with the 16x16 intra modes usable for i_neighbour and set *pi_count to their number (max = 4) */
+static void predict_16x16_mode_available( unsigned int i_neighbour, int *mode, int *pi_count )
+{
+    if( ( i_neighbour & (MB_LEFT|MB_TOP) ) == (MB_LEFT|MB_TOP) )
+    {
+        /* top and left available */
+        *mode++ = I_PRED_16x16_V;
+        *mode++ = I_PRED_16x16_H;
+        *mode++ = I_PRED_16x16_DC;
+        *mode++ = I_PRED_16x16_P;
+        *pi_count = 4;
+    }
+    else if( ( i_neighbour & MB_LEFT ) )
+    {
+        /* only left available */
+        *mode++ = I_PRED_16x16_DC_LEFT;
+        *mode++ = I_PRED_16x16_H;
+        *pi_count = 2;
+    }
+    else if( ( i_neighbour & MB_TOP ) )
+    {
+        /* only top available */
+        *mode++ = I_PRED_16x16_DC_TOP;
+        *mode++ = I_PRED_16x16_V;
+        *pi_count = 2;
+    }
+    else
+    {
+        /* none available */
+        *mode = I_PRED_16x16_DC_128;
+        *pi_count = 1;
+    }
+}
+
+/* Fill mode[] with the chroma 8x8 intra modes usable for i_neighbour and set *pi_count to their number (max = 4) */
+static void predict_8x8_mode_available( unsigned int i_neighbour, int *mode, int *pi_count )
+{
+    if( ( i_neighbour & (MB_LEFT|MB_TOP) ) == (MB_LEFT|MB_TOP) )
+    {
+        /* top and left available */
+        *mode++ = I_PRED_CHROMA_V;
+        *mode++ = I_PRED_CHROMA_H;
+        *mode++ = I_PRED_CHROMA_DC;
+        *mode++ = I_PRED_CHROMA_P;
+        *pi_count = 4;
+    }
+    else if( ( i_neighbour & MB_LEFT ) )
+    {
+        /* only left available */
+        *mode++ = I_PRED_CHROMA_DC_LEFT;
+        *mode++ = I_PRED_CHROMA_H;
+        *pi_count = 2;
+    }
+    else if( ( i_neighbour & MB_TOP ) )
+    {
+        /* only top available */
+        *mode++ = I_PRED_CHROMA_DC_TOP;
+        *mode++ = I_PRED_CHROMA_V;
+        *pi_count = 2;
+    }
+    else
+    {
+        /* none available */
+        *mode = I_PRED_CHROMA_DC_128;
+        *pi_count = 1;
+    }
+}
+
+/* Fill mode[] with the 4x4 intra modes usable for block idx; up to 9 entries are written (7 + 2 when b_c), so MAX = 9 */
+static void predict_4x4_mode_available( unsigned int i_neighbour, int idx, int *mode, int *pi_count )
+{
+    int b_a, b_b, b_c; /* left, top and top-right requirements of block idx satisfied */
+    static const unsigned int needmb[16] = /* neighbouring MBs that block idx depends on */
+    {
+        MB_LEFT|MB_TOP, MB_TOP,
+        MB_LEFT,        MB_PRIVATE,
+        MB_TOP,         MB_TOP|MB_TOPRIGHT,
+        0,              MB_PRIVATE,
+        MB_LEFT,        0,
+        MB_LEFT,        MB_PRIVATE,
+        0,              MB_PRIVATE,
+        0,              MB_PRIVATE
+    };
+
+    /* FIXME even when b_c == 0 there are some cases where the missing pixels
+     * are emulated and thus more modes are available. TODO:
+     * analysis and encode should be fixed too */
+    b_a = (needmb[idx]&i_neighbour&MB_LEFT) == (needmb[idx]&MB_LEFT);
+    b_b = (needmb[idx]&i_neighbour&MB_TOP) == (needmb[idx]&MB_TOP);
+    b_c = (needmb[idx]&i_neighbour&(MB_TOPRIGHT|MB_PRIVATE)) == (needmb[idx]&(MB_TOPRIGHT|MB_PRIVATE));
+
+    if( b_a && b_b )
+    {
+        *mode++ = I_PRED_4x4_DC;
+        *mode++ = I_PRED_4x4_H;
+        *mode++ = I_PRED_4x4_V;
+        *mode++ = I_PRED_4x4_DDR;
+        *mode++ = I_PRED_4x4_VR;
+        *mode++ = I_PRED_4x4_HD;
+        *mode++ = I_PRED_4x4_HU;
+
+        *pi_count = 7;
+
+        if( b_c ) /* the two remaining modes are only offered when the top-right requirement is met */
+        {
+            *mode++ = I_PRED_4x4_DDL;
+            *mode++ = I_PRED_4x4_VL;
+            (*pi_count) += 2;
+        }
+    }
+    else if( b_a && !b_b )
+    {
+        *mode++ = I_PRED_4x4_DC_LEFT;
+        *mode++ = I_PRED_4x4_H;
+        *pi_count = 2;
+    }
+    else if( !b_a && b_b )
+    {
+        *mode++ = I_PRED_4x4_DC_TOP;
+        *mode++ = I_PRED_4x4_V;
+        *pi_count = 2;
+    }
+    else
+    {
+        *mode++ = I_PRED_4x4_DC_128;
+        *pi_count = 1;
+    }
+}
+
+static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *res )
+{
+    const unsigned int flags = h->sh.i_type == SLICE_TYPE_I ? h->param.analyse.intra : h->param.analyse.inter;
+    const int i_stride = h->mb.pic.i_stride[0];
+    uint8_t  *p_src = h->mb.pic.p_fenc[0];
+    uint8_t  *p_dst = h->mb.pic.p_fdec[0];
+
+    int i, idx;
+
+    int i_max;
+    int predict_mode[9]; /* large enough for the 4x4 case, which can return up to 9 modes */
+
+    /*---------------- Try all mode and calculate their score ---------------*/
+
+    /* 16x16 prediction selection */
+    predict_16x16_mode_available( h->mb.i_neighbour, predict_mode, &i_max );
+    for( i = 0; i < i_max; i++ )
+    {
+        int i_sad;
+        int i_mode;
+
+        i_mode = predict_mode[i];
+
+        /* we do the prediction */
+        h->predict_16x16[i_mode]( p_dst, i_stride );
+
+        /* cost = SATD between prediction and source + lambda * bits of the mode */
+        i_sad = h->pixf.satd[PIXEL_16x16]( p_dst, i_stride, p_src, i_stride ) +
+                res->i_lambda * bs_size_ue( x264_mb_pred_mode16x16_fix[i_mode] );
+        /* a lower i_sad is better (-1 means no mode evaluated yet) */
+        if( res->i_sad_i16x16 == -1 || res->i_sad_i16x16 > i_sad )
+        {
+            res->i_predict16x16 = i_mode;
+            res->i_sad_i16x16     = i_sad;
+        }
+    }
+
+    /* 4x4 prediction selection */
+    if( flags & X264_ANALYSE_I4x4 )
+    {
+        res->i_sad_i4x4 = 0;
+        for( idx = 0; idx < 16; idx++ )
+        {
+            uint8_t *p_src_by;
+            uint8_t *p_dst_by;
+            int     i_best;
+            int x, y;
+            int i_pred_mode;
+
+            i_pred_mode= x264_mb_predict_intra4x4_mode( h, idx );
+            x = block_idx_x[idx];
+            y = block_idx_y[idx];
+
+            p_src_by = p_src + 4 * x + 4 * y * i_stride;
+            p_dst_by = p_dst + 4 * x + 4 * y * i_stride;
+
+            i_best = -1;
+            predict_4x4_mode_available( h->mb.i_neighbour, idx, predict_mode, &i_max );
+            for( i = 0; i < i_max; i++ )
+            {
+                int i_sad;
+                int i_mode;
+
+                i_mode = predict_mode[i];
+
+                /* we do the prediction */
+                h->predict_4x4[i_mode]( p_dst_by, i_stride );
+
+                /* SATD between prediction and source */
+                i_sad = h->pixf.satd[PIXEL_4x4]( p_dst_by, i_stride,
+                                                 p_src_by, i_stride );
+
+                i_sad += res->i_lambda * (i_pred_mode == x264_mb_pred_mode4x4_fix[i_mode] ? 1 : 4); /* 1 if it is the predicted mode, else 4 */
+
+                /* a lower i_sad is better (-1 means no mode evaluated yet) */
+                if( i_best == -1 || i_best > i_sad )
+                {
+                    res->i_predict4x4[x][y] = i_mode;
+                    i_best = i_sad;
+                }
+            }
+            res->i_sad_i4x4 += i_best;
+
+            /* we need to encode this 4x4 block now (the next blocks predict from it) */
+            h->predict_4x4[res->i_predict4x4[x][y]]( p_dst_by, i_stride );
+            x264_mb_encode_i4x4( h, idx, res->i_qp );
+
+            /* we need to store the 'fixed' version */
+            h->mb.cache.intra4x4_pred_mode[x264_scan8[idx]] =
+                x264_mb_pred_mode4x4_fix[res->i_predict4x4[x][y]];
+        }
+        res->i_sad_i4x4 += res->i_lambda * 24;    /* from JVT (SATD0) */
+    }
+}
+
+static void x264_mb_analyse_intra_chroma( x264_t *h, x264_mb_analysis_t *res )
+{
+    int i;
+
+    int i_max;
+    int predict_mode[9];
+
+    uint8_t *p_dstc[2], *p_srcc[2]; /* [0] = plane 1, [1] = plane 2 */
+    int      i_stride[2];
+
+    /* 8x8 prediction selection for chroma */
+    p_dstc[0] = h->mb.pic.p_fdec[1];
+    p_dstc[1] = h->mb.pic.p_fdec[2];
+    p_srcc[0] = h->mb.pic.p_fenc[1];
+    p_srcc[1] = h->mb.pic.p_fenc[2];
+
+    i_stride[0] = h->mb.pic.i_stride[1];
+    i_stride[1] = h->mb.pic.i_stride[2];
+
+    predict_8x8_mode_available( h->mb.i_neighbour, predict_mode, &i_max );
+    res->i_sad_i8x8 = -1;
+    for( i = 0; i < i_max; i++ )
+    {
+        int i_sad;
+        int i_mode;
+
+        i_mode = predict_mode[i];
+
+        /* we do the prediction (same mode for both chroma planes) */
+        h->predict_8x8[i_mode]( p_dstc[0], i_stride[0] );
+        h->predict_8x8[i_mode]( p_dstc[1], i_stride[1] );
+
+        /* cost = SATD of both planes + lambda * bits of the mode */
+        i_sad = h->pixf.satd[PIXEL_8x8]( p_dstc[0], i_stride[0],
+                                         p_srcc[0], i_stride[0] ) +
+                h->pixf.satd[PIXEL_8x8]( p_dstc[1], i_stride[1],
+                                         p_srcc[1], i_stride[1] ) +
+                res->i_lambda * bs_size_ue( x264_mb_pred_mode8x8_fix[i_mode] );
+
+        /* a lower i_sad is better (-1 means no mode evaluated yet) */
+        if( res->i_sad_i8x8 == -1 || res->i_sad_i8x8 > i_sad )
+        {
+            res->i_predict8x8 = i_mode;
+            res->i_sad_i8x8     = i_sad;
+        }
+    }
+}
+
+static void x264_mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a )
+{
+    x264_me_t m;
+    int i_ref;
+
+    /* 16x16 search on all list 0 reference frames; the cheapest one is kept in a->l0 */
+    m.i_pixel = PIXEL_16x16;
+    m.lm      = a->i_lambda;
+    m.p_fenc  = h->mb.pic.p_fenc[0];
+    m.i_stride= h->mb.pic.i_stride[0];
+    m.i_mv_range = a->i_mv_range;
+    m.b_mvc   = 0;
+//    m.mvc[0]  = 0;
+//    m.mvc[1]  = 0;
+
+    /* ME for ref 0 -- NOTE(review): unlike refs >= 1 below, no bs_size_te() ref cost is added here; confirm intended */
+    m.p_fref = h->mb.pic.p_fref[0][0][0];
+    x264_mb_predict_mv_16x16( h, 0, 0, m.mvp );
+    x264_me_search( h, &m );
+
+    a->l0.i_ref = 0;
+    a->l0.me16x16 = m;
+
+    for( i_ref = 1; i_ref < h->i_ref0; i_ref++ )
+    {
+        /* search with ref */
+        m.p_fref = h->mb.pic.p_fref[0][i_ref][0];
+        x264_mb_predict_mv_16x16( h, 0, i_ref, m.mvp );
+        x264_me_search( h, &m );
+
+        /* add ref cost */
+        m.cost += m.lm * bs_size_te( h->sh.i_num_ref_idx_l0_active - 1, i_ref );
+
+        if( m.cost < a->l0.me16x16.cost ) /* strictly better only: ties keep the lower ref index */
+        {
+            a->l0.i_ref = i_ref;
+            a->l0.me16x16 = m;
+        }
+    }
+
+    /* Set global ref, needed for all others modes */
+    x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.i_ref );
+}
+
+static void x264_mb_analyse_inter_p8x8( x264_t *h, x264_mb_analysis_t *a )
+{
+    uint8_t  *p_fref = h->mb.pic.p_fref[0][a->l0.i_ref][0]; /* reference chosen by the 16x16 search */
+    uint8_t  *p_fenc = h->mb.pic.p_fenc[0];
+
+    int i;
+
+    /* XXX Needed for x264_mb_predict_mv */
+    h->mb.i_partition = D_8x8;
+
+    for( i = 0; i < 4; i++ )
+    {
+        x264_me_t *m = &a->l0.me8x8[i];
+        const int x8 = i%2; /* 8x8 block position inside the MB */
+        const int y8 = i/2;
+
+        m->i_pixel = PIXEL_8x8;
+        m->lm      = a->i_lambda;
+
+        m->p_fenc = &p_fenc[8*(y8*h->mb.pic.i_stride[0]+x8)];
+        m->p_fref = &p_fref[8*(y8*h->mb.pic.i_stride[0]+x8)];
+        m->i_stride= h->mb.pic.i_stride[0];
+        m->i_mv_range = a->i_mv_range;
+
+        if( i == 0 ) /* only the first block gets the 16x16 MV in mvc (presumably an extra search candidate) */
+        {
+            m->b_mvc   = 1;
+            m->mvc[0] = a->l0.me16x16.mv[0];
+            m->mvc[1] = a->l0.me16x16.mv[1];
+        }
+        else
+        {
+            m->b_mvc   = 0;
+        }
+
+        x264_mb_predict_mv( h, 0, 4*i, 2, m->mvp );
+        x264_me_search( h, m );
+
+        x264_macroblock_cache_mv( h, 2*x8, 2*y8, 2, 2, 0, m->mv[0], m->mv[1] ); /* cached before the next block is predicted */
+    }
+
+    a->l0.i_cost8x8 = a->l0.me8x8[0].cost + a->l0.me8x8[1].cost +
+                   a->l0.me8x8[2].cost + a->l0.me8x8[3].cost;
+}
+
+static void x264_mb_analyse_inter_p16x8( x264_t *h, x264_mb_analysis_t *a )
+{
+    uint8_t  *p_fref = h->mb.pic.p_fref[0][a->l0.i_ref][0]; /* reference chosen by the 16x16 search */
+    uint8_t  *p_fenc = h->mb.pic.p_fenc[0];
+
+    int i;
+
+    /* XXX Needed for x264_mb_predict_mv */
+    h->mb.i_partition = D_16x8;
+
+    for( i = 0; i < 2; i++ ) /* i = 0: rows 0-7, i = 1: rows 8-15 */
+    {
+        x264_me_t *m = &a->l0.me16x8[i];
+
+        m->i_pixel = PIXEL_16x8;
+        m->lm      = a->i_lambda;
+
+        m->p_fenc = &p_fenc[8*i*h->mb.pic.i_stride[0]];
+        m->p_fref = &p_fref[8*i*h->mb.pic.i_stride[0]];
+        m->i_stride= h->mb.pic.i_stride[0];
+        m->i_mv_range = a->i_mv_range;
+
+        m->b_mvc   = 1; /* mvc = MV of 8x8 block 2*i, so the 8x8 search must have been run first */
+        m->mvc[0] = a->l0.me8x8[2*i].mv[0];
+        m->mvc[1] = a->l0.me8x8[2*i].mv[1];
+
+        x264_mb_predict_mv( h, 0, 8*i, 4, m->mvp );
+        x264_me_search( h, m );
+
+        x264_macroblock_cache_mv( h, 0, 2*i, 4, 2, 0, m->mv[0], m->mv[1] ); /* cached before the next partition is predicted */
+    }
+
+    a->l0.i_cost16x8 = a->l0.me16x8[0].cost + a->l0.me16x8[1].cost;
+}
+
+static void x264_mb_analyse_inter_p8x16( x264_t *h, x264_mb_analysis_t *a )
+{
+    uint8_t  *p_fref = h->mb.pic.p_fref[0][a->l0.i_ref][0]; /* reference chosen by the 16x16 search */
+    uint8_t  *p_fenc = h->mb.pic.p_fenc[0];
+
+    int i;
+
+    /* XXX Needed for x264_mb_predict_mv */
+    h->mb.i_partition = D_8x16;
+
+    for( i = 0; i < 2; i++ ) /* i = 0: columns 0-7, i = 1: columns 8-15 */
+    {
+        x264_me_t *m = &a->l0.me8x16[i];
+
+        m->i_pixel = PIXEL_8x16;
+        m->lm      = a->i_lambda;
+
+        m->p_fenc  = &p_fenc[8*i];
+        m->p_fref  = &p_fref[8*i];
+        m->i_stride= h->mb.pic.i_stride[0];
+        m->i_mv_range = a->i_mv_range;
+
+        m->b_mvc   = 1; /* mvc = MV of 8x8 block i, so the 8x8 search must have been run first */
+        m->mvc[0] = a->l0.me8x8[i].mv[0];
+        m->mvc[1] = a->l0.me8x8[i].mv[1];
+
+        x264_mb_predict_mv( h, 0, 4*i, 2, m->mvp );
+        x264_me_search( h, m );
+
+        x264_macroblock_cache_mv( h, 2*i, 0, 2, 4, 0, m->mv[0], m->mv[1] ); /* cached before the next partition is predicted */
+    }
+
+    a->l0.i_cost8x16 = a->l0.me8x16[0].cost + a->l0.me8x16[1].cost;
+}
+
+static void x264_mb_analyse_inter_p4x4( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
+{
+    uint8_t  *p_fref = h->mb.pic.p_fref[0][a->l0.i_ref][0]; /* reference chosen by the 16x16 search */
+    uint8_t  *p_fenc = h->mb.pic.p_fenc[0];
+
+    int i4x4;
+
+    /* XXX Needed for x264_mb_predict_mv */
+    h->mb.i_partition = D_8x8;
+
+    for( i4x4 = 0; i4x4 < 4; i4x4++ ) /* the four 4x4 blocks of 8x8 partition i8x8 */
+    {
+        const int idx = 4*i8x8 + i4x4;
+        const int x4 = block_idx_x[idx];
+        const int y4 = block_idx_y[idx];
+
+        x264_me_t *m = &a->l0.me4x4[i8x8][i4x4];
+
+        m->i_pixel = PIXEL_4x4;
+        m->lm      = a->i_lambda;
+
+        m->p_fenc  = &p_fenc[4*(y4*h->mb.pic.i_stride[0]+x4)];
+        m->p_fref  = &p_fref[4*(y4*h->mb.pic.i_stride[0]+x4)];
+        m->i_stride= h->mb.pic.i_stride[0];
+        m->i_mv_range = a->i_mv_range;
+
+        if( i4x4 == 0 ) /* only the first block gets the enclosing 8x8 MV in mvc */
+        {
+            m->b_mvc   = 1;
+            m->mvc[0] = a->l0.me8x8[i8x8].mv[0];
+            m->mvc[1] = a->l0.me8x8[i8x8].mv[1];
+        }
+        else
+        {
+            m->b_mvc   = 0;
+        }
+
+        x264_mb_predict_mv( h, 0, idx, 1, m->mvp );
+        x264_me_search( h, m );
+
+        x264_macroblock_cache_mv( h, x4, y4, 1, 1, 0, m->mv[0], m->mv[1] ); /* cached before the next block is predicted */
+    }
+
+    a->l0.i_cost4x4[i8x8] = a->l0.me4x4[i8x8][0].cost +
+                         a->l0.me4x4[i8x8][1].cost +
+                         a->l0.me4x4[i8x8][2].cost +
+                         a->l0.me4x4[i8x8][3].cost;
+}
+
+static void x264_mb_analyse_inter_p8x4( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
+{
+    uint8_t  *p_fref = h->mb.pic.p_fref[0][a->l0.i_ref][0]; /* reference chosen by the 16x16 search */
+    uint8_t  *p_fenc = h->mb.pic.p_fenc[0];
+
+    int i8x4;
+
+    /* XXX Needed for x264_mb_predict_mv */
+    h->mb.i_partition = D_8x8;
+
+    for( i8x4 = 0; i8x4 < 2; i8x4++ ) /* upper then lower 8x4 half of 8x8 partition i8x8 */
+    {
+        const int idx = 4*i8x8 + 2*i8x4; /* left 4x4 block of that half */
+        const int x4 = block_idx_x[idx];
+        const int y4 = block_idx_y[idx];
+
+        x264_me_t *m = &a->l0.me8x4[i8x8][i8x4];
+
+        m->i_pixel = PIXEL_8x4;
+        m->lm      = a->i_lambda;
+
+        m->p_fenc  = &p_fenc[4*(y4*h->mb.pic.i_stride[0]+x4)];
+        m->p_fref  = &p_fref[4*(y4*h->mb.pic.i_stride[0]+x4)];
+        m->i_stride= h->mb.pic.i_stride[0];
+        m->i_mv_range = a->i_mv_range;
+
+        if( i8x4 == 0 ) /* mvc = MV of the first 4x4 block, so the 4x4 search of i8x8 must have been run first */
+        {
+            m->b_mvc   = 1;
+            m->mvc[0] = a->l0.me4x4[i8x8][0].mv[0];
+            m->mvc[1] = a->l0.me4x4[i8x8][0].mv[1];
+        }
+        else
+        {
+            m->b_mvc   = 0;
+        }
+
+        x264_mb_predict_mv( h, 0, idx, 2, m->mvp );
+        x264_me_search( h, m );
+
+        x264_macroblock_cache_mv( h, x4, y4, 2, 1, 0, m->mv[0], m->mv[1] ); /* cached before the next block is predicted */
+    }
+
+    a->l0.i_cost8x4[i8x8] = a->l0.me8x4[i8x8][0].cost + a->l0.me8x4[i8x8][1].cost;
+}
+
+static void x264_mb_analyse_inter_p4x8( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
+{
+    uint8_t  *p_fref = h->mb.pic.p_fref[0][a->l0.i_ref][0]; /* reference chosen by the 16x16 search */
+    uint8_t  *p_fenc = h->mb.pic.p_fenc[0];
+
+    int i4x8;
+
+    /* XXX Needed for x264_mb_predict_mv */
+    h->mb.i_partition = D_8x8;
+
+    for( i4x8 = 0; i4x8 < 2; i4x8++ ) /* left then right 4x8 half of 8x8 partition i8x8 */
+    {
+        const int idx = 4*i8x8 + i4x8; /* top 4x4 block of that half */
+        const int x4 = block_idx_x[idx];
+        const int y4 = block_idx_y[idx];
+
+        x264_me_t *m = &a->l0.me4x8[i8x8][i4x8];
+
+        m->i_pixel = PIXEL_4x8;
+        m->lm      = a->i_lambda;
+
+        m->p_fenc  = &p_fenc[4*(y4*h->mb.pic.i_stride[0]+x4)];
+        m->p_fref  = &p_fref[4*(y4*h->mb.pic.i_stride[0]+x4)];
+        m->i_stride= h->mb.pic.i_stride[0];
+        m->i_mv_range = a->i_mv_range;
+
+        if( i4x8 == 0 ) /* mvc = MV of the first 4x4 block, so the 4x4 search of i8x8 must have been run first */
+        {
+            m->b_mvc   = 1;
+            m->mvc[0] = a->l0.me4x4[i8x8][0].mv[0];
+            m->mvc[1] = a->l0.me4x4[i8x8][0].mv[1];
+        }
+        else
+        {
+            m->b_mvc   = 0;
+        }
+
+        x264_mb_predict_mv( h, 0, idx, 1, m->mvp );
+        x264_me_search( h, m );
+
+        x264_macroblock_cache_mv( h, x4, y4, 1, 2, 0, m->mv[0], m->mv[1] ); /* cached before the next block is predicted */
+    }
+
+    a->l0.i_cost4x8[i8x8] = a->l0.me4x8[i8x8][0].cost + a->l0.me4x8[i8x8][1].cost;
+}
+
+
+static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
+{
+    uint8_t pix1[16*16], pix2[16*16]; /* motion-compensated list 0 / list 1 blocks, stride 16 */
+
+    x264_me_t m;
+    int i_ref;
+
+    /* 16x16 search on all reference frames of both lists; the cheapest of each list is kept */
+    m.i_pixel = PIXEL_16x16;
+    m.lm      = a->i_lambda;
+    m.p_fenc  = h->mb.pic.p_fenc[0];
+    m.i_stride= h->mb.pic.i_stride[0];
+    m.b_mvc   = 0;
+    m.i_mv_range = a->i_mv_range;
+
+    /* ME for List 0 ref 0 (no ref cost is added for ref 0) */
+    m.p_fref = h->mb.pic.p_fref[0][0][0];
+    x264_mb_predict_mv_16x16( h, 0, 0, m.mvp );
+    x264_me_search( h, &m );
+
+    a->l0.i_ref = 0;
+    a->l0.me16x16 = m;
+
+    for( i_ref = 1; i_ref < h->i_ref0; i_ref++ )
+    {
+        /* search with ref */
+        m.p_fref = h->mb.pic.p_fref[0][i_ref][0];
+        x264_mb_predict_mv_16x16( h, 0, i_ref, m.mvp );
+        x264_me_search( h, &m );
+
+        /* add ref cost */
+        m.cost += m.lm * bs_size_te( h->sh.i_num_ref_idx_l0_active - 1, i_ref );
+
+        if( m.cost < a->l0.me16x16.cost )
+        {
+            a->l0.i_ref = i_ref;
+            a->l0.me16x16 = m;
+        }
+    }
+
+    /* ME for list 1 ref 0 (no ref cost is added for ref 0) */
+    m.p_fref = h->mb.pic.p_fref[1][0][0];
+    x264_mb_predict_mv_16x16( h, 1, 0, m.mvp );
+    x264_me_search( h, &m );
+
+    a->l1.i_ref = 0;
+    a->l1.me16x16 = m;
+
+    for( i_ref = 1; i_ref < h->i_ref1; i_ref++ )
+    {
+        /* search with ref */
+        m.p_fref = h->mb.pic.p_fref[1][i_ref][0];
+        x264_mb_predict_mv_16x16( h, 1, i_ref, m.mvp );
+        x264_me_search( h, &m );
+
+        /* add ref cost */
+        m.cost += m.lm * bs_size_te( h->sh.i_num_ref_idx_l1_active - 1, i_ref );
+
+        if( m.cost < a->l1.me16x16.cost )
+        {
+            a->l1.i_ref = i_ref;
+            a->l1.me16x16 = m;
+        }
+    }
+
+    /* Set global ref, needed for all others modes FIXME some work for mixed block mode */
+    x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.i_ref );
+    x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, a->l1.i_ref );
+
+    /* get cost of BI mode: build both predictions with the MVs found above */
+    h->mc[MC_LUMA]( h->mb.pic.p_fref[0][a->l0.i_ref][0], h->mb.pic.i_stride[0],
+                    pix1, 16,
+                    a->l0.me16x16.mv[0], a->l0.me16x16.mv[1],
+                    16, 16 );
+    h->mc[MC_LUMA]( h->mb.pic.p_fref[1][a->l1.i_ref][0], h->mb.pic.i_stride[0],
+                    pix2, 16,
+                    a->l1.me16x16.mv[0], a->l1.me16x16.mv[1],
+                    16, 16 );
+    h->pixf.avg[PIXEL_16x16]( pix1, 16, pix2, 16 ); /* presumably averages pix2 into pix1, which is what gets scored below -- confirm */
+
+    a->i_cost16x16bi = h->pixf.satd[PIXEL_16x16]( h->mb.pic.p_fenc[0], h->mb.pic.i_stride[0], pix1, 16 ) +
+                       a->i_lambda * ( bs_size_te( h->sh.i_num_ref_idx_l0_active - 1, a->l0.i_ref ) +
+                                       bs_size_te( h->sh.i_num_ref_idx_l1_active - 1, a->l1.i_ref ) +
+                                       bs_size_se( a->l0.me16x16.mv[0] - a->l0.me16x16.mvp[0] ) +
+                                       bs_size_se( a->l0.me16x16.mv[1] - a->l0.me16x16.mvp[1] ) +
+                                       bs_size_se( a->l1.me16x16.mv[0] - a->l1.me16x16.mvp[0] ) +
+                                       bs_size_se( a->l1.me16x16.mv[1] - a->l1.me16x16.mvp[1] ) );
+}
+
+/*****************************************************************************
+ * x264_macroblock_analyse:
+ *****************************************************************************/
+void x264_macroblock_analyse( x264_t *h )
+{
+    x264_mb_analysis_t analysis;
+    int i;
+
+    /* qp TODO */
+    h->mb.qp[h->mb.i_mb_xy] = x264_clip3( h->pps->i_pic_init_qp + h->sh.i_qp_delta + 0, 0, 51 );
+
+    /* init analysis */
+    x264_mb_analyse_init( h, &analysis, h->mb.qp[h->mb.i_mb_xy] );
+
+    /*--------------------------- Do the analysis ---------------------------*/
+    if( h->sh.i_type == SLICE_TYPE_I )
+    {
+        x264_mb_analyse_intra( h, &analysis );
+
+        if( analysis.i_sad_i4x4 >= 0 &&  analysis.i_sad_i4x4 < analysis.i_sad_i16x16 )
+            h->mb.i_type = I_4x4;
+        else
+            h->mb.i_type = I_16x16;
+    }
+    else if( h->sh.i_type == SLICE_TYPE_P )
+    {
+        const unsigned int i_neighbour = h->mb.i_neighbour;
+
+        int b_skip = 0;
+        int i_cost;
+
+        /* Fast P_SKIP detection */
+        if( analysis.i_qp == h->mb.i_last_qp &&
+            ( ( (i_neighbour&MB_LEFT) && h->mb.type[h->mb.i_mb_xy - 1] == P_SKIP ) ||
+              ( (i_neighbour&MB_TOP) && h->mb.type[h->mb.i_mb_xy - h->mb.i_mb_stride] == P_SKIP ) ||
+              ( ((i_neighbour&(MB_TOP|MB_LEFT)) == (MB_TOP|MB_LEFT) ) && h->mb.type[h->mb.i_mb_xy - h->mb.i_mb_stride-1 ] == P_SKIP ) ||
+              ( (i_neighbour&MB_TOPRIGHT) && h->mb.type[h->mb.i_mb_xy - h->mb.i_mb_stride+1 ] == P_SKIP ) ) )
+        {
+            b_skip = x264_macroblock_probe_pskip( h );
+        }
+
+        if( b_skip )
+        {
+            h->mb.i_type = P_SKIP;
+            h->mb.i_partition = D_16x16;
+        }
+        else
+        {
+            const unsigned int flags = h->param.analyse.inter;
+            int i_type;
+            int i_partition;
+
+            x264_mb_analyse_inter_p16x16( h, &analysis );
+            if( flags & X264_ANALYSE_PSUB16x16 )
+                x264_mb_analyse_inter_p8x8( h, &analysis );
+
+            /* Select best inter mode */
+            i_type = P_L0;
+            i_partition = D_16x16;
+            i_cost = analysis.l0.me16x16.cost;
+
+            if( ( flags & X264_ANALYSE_PSUB16x16 ) &&
+                analysis.l0.i_cost8x8 < analysis.l0.me16x16.cost )
+            {
+                int i;
+
+                i_type = P_8x8;
+                i_partition = D_8x8;
+                h->mb.i_sub_partition[0] = D_L0_8x8;
+                h->mb.i_sub_partition[1] = D_L0_8x8;
+                h->mb.i_sub_partition[2] = D_L0_8x8;
+                h->mb.i_sub_partition[3] = D_L0_8x8;
+
+                i_cost = analysis.l0.i_cost8x8;
+
+                /* Do sub 8x8 */
+                if( flags & X264_ANALYSE_PSUB8x8 )
+                {
+                    for( i = 0; i < 4; i++ )
+                    {
+                        x264_mb_analyse_inter_p4x4( h, &analysis, i );
+                        if( analysis.l0.i_cost4x4[i] < analysis.l0.me8x8[i].cost )
+                        {
+                            int i_cost8x8;
+
+                            h->mb.i_sub_partition[i] = D_L0_4x4;
+                            i_cost8x8 = analysis.l0.i_cost4x4[i];
+
+                            x264_mb_analyse_inter_p8x4( h, &analysis, i );
+                            if( analysis.l0.i_cost8x4[i] < analysis.l0.i_cost4x4[i] )
+                            {
+                                h->mb.i_sub_partition[i] = D_L0_8x4;
+                                i_cost8x8 = analysis.l0.i_cost8x4[i];
+                            }
+
+                            x264_mb_analyse_inter_p4x8( h, &analysis, i );
+                            if( analysis.l0.i_cost4x8[i] < analysis.l0.i_cost4x4[i] )
+                            {
+                                h->mb.i_sub_partition[i] = D_L0_4x8;
+                                i_cost8x8 = analysis.l0.i_cost4x8[i];
+                            }
+
+                            i_cost += i_cost8x8 - analysis.l0.me8x8[i].cost;
+                        }
+                    }
+                }
+
+                /* Now do sub 16x8/8x16 */
+                x264_mb_analyse_inter_p16x8( h, &analysis );
+                if( analysis.l0.i_cost16x8 < i_cost )
+                {
+                    i_type = P_L0;
+                    i_partition = D_16x8;
+                    i_cost = analysis.l0.i_cost16x8;
+                }
+
+                x264_mb_analyse_inter_p8x16( h, &analysis );
+                if( analysis.l0.i_cost8x16 < i_cost )
+                {
+                    i_type = P_L0;
+                    i_partition = D_8x16;
+                    i_cost = analysis.l0.i_cost8x16;
+                }
+            }
+
+            h->mb.i_type = i_type;
+            h->mb.i_partition = i_partition;
+
+            /* refine qpel */
+            if( h->mb.i_partition == D_16x16 )
+            {
+                x264_me_refine_qpel( h, &analysis.l0.me16x16 );
+                i_cost = analysis.l0.me16x16.cost;
+            }
+            else if( h->mb.i_partition == D_16x8 )
+            {
+                x264_me_refine_qpel( h, &analysis.l0.me16x8[0] );
+                x264_me_refine_qpel( h, &analysis.l0.me16x8[1] );
+                i_cost = analysis.l0.me16x8[0].cost + analysis.l0.me16x8[1].cost;
+            }
+            else if( h->mb.i_partition == D_8x16 )
+            {
+                x264_me_refine_qpel( h, &analysis.l0.me8x16[0] );
+                x264_me_refine_qpel( h, &analysis.l0.me8x16[1] );
+                i_cost = analysis.l0.me8x16[0].cost + analysis.l0.me8x16[1].cost;
+            }
+            else if( h->mb.i_partition == D_8x8 )
+            {
+                int i8x8;
+                i_cost = 0;
+                for( i8x8 = 0; i8x8 < 4; i8x8++ )
+                {
+                    switch( h->mb.i_sub_partition[i8x8] )
+                    {
+                        case D_L0_8x8:
+                            x264_me_refine_qpel( h, &analysis.l0.me8x8[i8x8] );
+                            i_cost += analysis.l0.me8x8[i8x8].cost;
+                            break;
+                        case D_L0_8x4:
+                            x264_me_refine_qpel( h, &analysis.l0.me8x4[i8x8][0] );
+                            x264_me_refine_qpel( h, &analysis.l0.me8x4[i8x8][1] );
+                            i_cost += analysis.l0.me8x4[i8x8][0].cost +
+                                      analysis.l0.me8x4[i8x8][1].cost;
+                            break;
+                        case D_L0_4x8:
+                            x264_me_refine_qpel( h, &analysis.l0.me4x8[i8x8][0] );
+                            x264_me_refine_qpel( h, &analysis.l0.me4x8[i8x8][1] );
+                            i_cost += analysis.l0.me4x8[i8x8][0].cost +
+                                      analysis.l0.me4x8[i8x8][1].cost;
+                            break;
+
+                        case D_L0_4x4:
+                            x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][0] );
+                            x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][1] );
+                            x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][2] );
+                            x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][3] );
+                            i_cost += analysis.l0.me4x4[i8x8][0].cost +
+                                      analysis.l0.me4x4[i8x8][1].cost +
+                                      analysis.l0.me4x4[i8x8][2].cost +
+                                      analysis.l0.me4x4[i8x8][3].cost;
+                            break;
+                        default:
+                            fprintf( stderr, "internal error (!8x8 && !4x4)" );
+                            break;
+                    }
+                }
+            }
+
+            x264_mb_analyse_intra( h, &analysis );
+            if( analysis.i_sad_i16x16 >= 0 && analysis.i_sad_i16x16 < i_cost )
+            {
+                h->mb.i_type = I_16x16;
+                i_cost = analysis.i_sad_i16x16;
+            }
+
+            if( analysis.i_sad_i4x4 >=0 && analysis.i_sad_i4x4 < i_cost )
+            {
+                h->mb.i_type = I_4x4;
+                i_cost = analysis.i_sad_i4x4;
+            }
+        }
+    }
+    else if( h->sh.i_type == SLICE_TYPE_B )
+    {
+        int i_cost;
+
+        /* best inter mode */
+        x264_mb_analyse_inter_b16x16( h, &analysis );
+        h->mb.i_type = B_L0_L0;
+        h->mb.i_partition = D_16x16;
+        i_cost = analysis.l0.me16x16.cost;
+
+        if( analysis.l1.me16x16.cost < i_cost )
+        {
+            h->mb.i_type = B_L1_L1;
+            i_cost = analysis.l1.me16x16.cost;
+        }
+        if( analysis.i_cost16x16bi < i_cost )
+        {
+            h->mb.i_type = B_BI_BI;
+            i_cost = analysis.i_cost16x16bi;
+        }
+
+        /* best intra mode */
+        x264_mb_analyse_intra( h, &analysis );
+        if( analysis.i_sad_i16x16 >= 0 && analysis.i_sad_i16x16 < i_cost )
+        {
+            h->mb.i_type = I_16x16;
+            i_cost = analysis.i_sad_i16x16;
+        }
+        if( analysis.i_sad_i4x4 >=0 && analysis.i_sad_i4x4 < i_cost )
+        {
+            h->mb.i_type = I_4x4;
+            i_cost = analysis.i_sad_i4x4;
+        }
+    }
+#undef BEST_TYPE
+
+    /*-------------------- Update MB from the analysis ----------------------*/
+    h->mb.type[h->mb.i_mb_xy] = h->mb.i_type;
+    switch( h->mb.i_type )
+    {
+        case I_4x4:
+            for( i = 0; i < 16; i++ )
+            {
+                h->mb.cache.intra4x4_pred_mode[x264_scan8[i]] =
+                    analysis.i_predict4x4[block_idx_x[i]][block_idx_y[i]];
+            }
+
+            x264_mb_analyse_intra_chroma( h, &analysis );
+            h->mb.i_chroma_pred_mode = analysis.i_predict8x8;
+            break;
+        case I_16x16:
+            h->mb.i_intra16x16_pred_mode = analysis.i_predict16x16;
+
+            x264_mb_analyse_intra_chroma( h, &analysis );
+            h->mb.i_chroma_pred_mode = analysis.i_predict8x8;
+            break;
+
+        case P_L0:
+            x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, analysis.l0.i_ref );
+            switch( h->mb.i_partition )
+            {
+                case D_16x16:
+                    x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 0, analysis.l0.me16x16.mv[0], analysis.l0.me16x16.mv[1] );
+                    break;
+
+                case D_16x8:
+                    x264_macroblock_cache_mv ( h, 0, 0, 4, 2, 0, analysis.l0.me16x8[0].mv[0], analysis.l0.me16x8[0].mv[1] );
+                    x264_macroblock_cache_mv ( h, 0, 2, 4, 2, 0, analysis.l0.me16x8[1].mv[0], analysis.l0.me16x8[1].mv[1] );
+                    break;
+
+                case D_8x16:
+                    x264_macroblock_cache_mv ( h, 0, 0, 2, 4, 0, analysis.l0.me8x16[0].mv[0], analysis.l0.me8x16[0].mv[1] );
+                    x264_macroblock_cache_mv ( h, 2, 0, 2, 4, 0, analysis.l0.me8x16[1].mv[0], analysis.l0.me8x16[1].mv[1] );
+                    break;
+
+                default:
+                    fprintf( stderr, "internal error P_L0 and partition=%d\n", h->mb.i_partition );
+                    break;
+            }
+            break;
+
+        case P_8x8:
+            x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, analysis.l0.i_ref );
+            for( i = 0; i < 4; i++ )
+            {
+                const int x = 2*(i%2);
+                const int y = 2*(i/2);
+
+                switch( h->mb.i_sub_partition[i] )
+                {
+                    case D_L0_8x8:
+                        x264_macroblock_cache_mv( h, x, y, 2, 2, 0, analysis.l0.me8x8[i].mv[0], analysis.l0.me8x8[i].mv[1] );
+                        break;
+                    case D_L0_8x4:
+                        x264_macroblock_cache_mv( h, x, y+0, 2, 1, 0, analysis.l0.me8x4[i][0].mv[0], analysis.l0.me8x4[i][0].mv[1] );
+                        x264_macroblock_cache_mv( h, x, y+1, 2, 1, 0, analysis.l0.me8x4[i][1].mv[0], analysis.l0.me8x4[i][1].mv[1] );
+                        break;
+                    case D_L0_4x8:
+                        x264_macroblock_cache_mv( h, x+0, y, 1, 2, 0, analysis.l0.me4x8[i][0].mv[0], analysis.l0.me4x8[i][0].mv[1] );
+                        x264_macroblock_cache_mv( h, x+1, y, 1, 2, 0, analysis.l0.me4x8[i][1].mv[0], analysis.l0.me4x8[i][1].mv[1] );
+                        break;
+                    case D_L0_4x4:
+                        x264_macroblock_cache_mv( h, x+0, y+0, 1, 1, 0, analysis.l0.me4x4[i][0].mv[0], analysis.l0.me4x4[i][0].mv[1] );
+                        x264_macroblock_cache_mv( h, x+1, y+0, 1, 1, 0, analysis.l0.me4x4[i][1].mv[0], analysis.l0.me4x4[i][1].mv[1] );
+                        x264_macroblock_cache_mv( h, x+0, y+1, 1, 1, 0, analysis.l0.me4x4[i][2].mv[0], analysis.l0.me4x4[i][2].mv[1] );
+                        x264_macroblock_cache_mv( h, x+1, y+1, 1, 1, 0, analysis.l0.me4x4[i][3].mv[0], analysis.l0.me4x4[i][3].mv[1] );
+                        break;
+                    default:
+                        fprintf( stderr, "internal error\n" );
+                        break;
+                }
+            }
+            break;
+
+        case P_SKIP:
+        {
+            int mvp[2];
+            x264_mb_predict_mv_pskip( h, mvp );
+            /* */
+            h->mb.i_partition = D_16x16;
+            x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, 0 );
+            x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 0, mvp[0], mvp[1] );
+            break;
+        }
+
+        case B_L0_L0:
+            switch( h->mb.i_partition )
+            {
+                case D_16x16:
+                    x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, analysis.l0.i_ref );
+                    x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 0, analysis.l0.me16x16.mv[0], analysis.l0.me16x16.mv[1] );
+
+                    x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, -1 );
+                    x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 1,  0, 0 );
+                    x264_macroblock_cache_mvd( h, 0, 0, 4, 4, 1,  0, 0 );
+                    break;
+                default:
+                    fprintf( stderr, "internal error\n" );
+                    break;
+            }
+            break;
+        case B_L1_L1:
+            switch( h->mb.i_partition )
+            {
+                case D_16x16:
+                    x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, -1 );
+                    x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 0,  0, 0 );
+                    x264_macroblock_cache_mvd( h, 0, 0, 4, 4, 0,  0, 0 );
+
+                    x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, analysis.l1.i_ref );
+                    x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 1, analysis.l1.me16x16.mv[0], analysis.l1.me16x16.mv[1] );
+                    break;
+
+                default:
+                    fprintf( stderr, "internal error\n" );
+                    break;
+            }
+            break;
+        case B_BI_BI:
+            switch( h->mb.i_partition )
+            {
+                case D_16x16:
+                    x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, analysis.l0.i_ref );
+                    x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 0, analysis.l0.me16x16.mv[0], analysis.l0.me16x16.mv[1] );
+
+                    x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, analysis.l1.i_ref );
+                    x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 1, analysis.l1.me16x16.mv[0], analysis.l1.me16x16.mv[1] );
+                    break;
+
+                default:
+                    fprintf( stderr, "internal error\n" );
+                    break;
+            }
+            break;
+
+        default:
+            fprintf( stderr, "internal error (invalid MB type)\n" );
+            break;
+    }
+}
+
diff --git a/encoder/analyse.h b/encoder/analyse.h
new file mode 100644
index 00000000..8e591e89
--- /dev/null
+++ b/encoder/analyse.h
@@ -0,0 +1,29 @@
+/*****************************************************************************
+ * analyse.h: h264 encoder library
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: analyse.h,v 1.1 2004/06/03 19:27:08 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#ifndef _ANALYSE_H
+#define _ANALYSE_H 1
+/* Decide how the current macroblock of h is coded: sets h->mb.i_type / i_partition and caches the chosen prediction modes, refs and mvs in h->mb. */
+void x264_macroblock_analyse( x264_t *h );
+
+#endif /* _ANALYSE_H */
diff --git a/encoder/cabac.c b/encoder/cabac.c
new file mode 100644
index 00000000..8b1f3965
--- /dev/null
+++ b/encoder/cabac.c
@@ -0,0 +1,1195 @@
+/*****************************************************************************
+ * cabac.c: h264 encoder library
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: cabac.c,v 1.1 2004/06/03 19:27:08 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdint.h>
+
+#include "../core/common.h"
+#include "macroblock.h"
+
+static const uint8_t block_idx_x[16] =    /* x coordinate (0..3) of 4x4 block i inside the macroblock; i/4 is the 8x8 quadrant */
+{
+    0, 1, 0, 1, 2, 3, 2, 3, 0, 1, 0, 1, 2, 3, 2, 3
+};
+static const uint8_t block_idx_y[16] =    /* y coordinate (0..3) of 4x4 block i inside the macroblock */
+{
+    0, 0, 1, 1, 0, 0, 1, 1, 2, 2, 3, 3, 2, 2, 3, 3
+};
+static const uint8_t block_idx_xy[4][4] = /* inverse map, indexed [x][y]: block_idx_xy[block_idx_x[i]][block_idx_y[i]] == i */
+{
+    { 0, 2, 8,  10},    /* x = 0, y = 0..3 */
+    { 1, 3, 9,  11},    /* x = 1 */
+    { 4, 6, 12, 14},    /* x = 2 */
+    { 5, 7, 13, 15}     /* x = 3 */
+};
+
+static void x264_cabac_mb_type( x264_t *h )
+{
+    const int i_mb_type = h->mb.i_type;
+    /* Write the type of the current macroblock (h->mb.i_type); the bin strings and context indices depend on the slice type. */
+    if( h->sh.i_type == SLICE_TYPE_I )
+    {
+        int ctx = 0;    /* counts the existing left/top neighbours that are not I_4x4 */
+        if( h->mb.i_mb_x > 0 && h->mb.type[h->mb.i_mb_xy - 1] != I_4x4 )
+        {
+            ctx++;
+        }
+        if( h->mb.i_mb_y > 0 && h->mb.type[h->mb.i_mb_xy - h->mb.i_mb_stride] != I_4x4 )
+        {
+            ctx++;
+        }
+        /* first bin: 0 = I_4x4, 1 = I_PCM or I_16x16; the terminal bin that follows tells those two apart */
+        if( i_mb_type == I_4x4 )
+        {
+            x264_cabac_encode_decision( &h->cabac, 3 + ctx, 0 );
+        }
+        else if( i_mb_type == I_PCM )
+        {
+            x264_cabac_encode_decision( &h->cabac, 3 + ctx, 1 );
+            x264_cabac_encode_terminal( &h->cabac, 1 );
+        }
+        else    /* I_16x16 */
+        {
+            x264_cabac_encode_decision( &h->cabac, 3 + ctx, 1 );
+            x264_cabac_encode_terminal( &h->cabac, 0 );
+            /* the type also carries: luma cbp flag, chroma cbp (one or two bins), then the two bits of the 16x16 prediction mode */
+            x264_cabac_encode_decision( &h->cabac, 3 + 3, ( h->mb.i_cbp_luma == 0 ? 0 : 1 ));
+            if( h->mb.i_cbp_chroma == 0 )
+            {
+                x264_cabac_encode_decision( &h->cabac, 3 + 4, 0 );
+            }
+            else
+            {
+                x264_cabac_encode_decision( &h->cabac, 3 + 4, 1 );
+                x264_cabac_encode_decision( &h->cabac, 3 + 5, ( h->mb.i_cbp_chroma == 1 ? 0 : 1 ) );
+            }
+            x264_cabac_encode_decision( &h->cabac, 3 + 6, ( (h->mb.i_intra16x16_pred_mode / 2) ? 1 : 0 ));
+            x264_cabac_encode_decision( &h->cabac, 3 + 7, ( (h->mb.i_intra16x16_pred_mode % 2) ? 1 : 0 ));
+        }
+    }
+    else if( h->sh.i_type == SLICE_TYPE_P )
+    {
+        /* prefix: 14, suffix: 17 */
+        if( i_mb_type == P_L0 )
+        {
+            if( h->mb.i_partition == D_16x16 )
+            {
+                x264_cabac_encode_decision( &h->cabac, 14, 0 );
+                x264_cabac_encode_decision( &h->cabac, 15, 0 );
+                x264_cabac_encode_decision( &h->cabac, 16, 0 );
+            }
+            else if( h->mb.i_partition == D_16x8 )
+            {
+                x264_cabac_encode_decision( &h->cabac, 14, 0 );
+                x264_cabac_encode_decision( &h->cabac, 15, 1 );
+                x264_cabac_encode_decision( &h->cabac, 17, 1 );
+            }
+            else if( h->mb.i_partition == D_8x16 )
+            {
+                x264_cabac_encode_decision( &h->cabac, 14, 0 );
+                x264_cabac_encode_decision( &h->cabac, 15, 1 );
+                x264_cabac_encode_decision( &h->cabac, 17, 0 );
+            }
+        }
+        else if( i_mb_type == P_8x8 )
+        {
+            x264_cabac_encode_decision( &h->cabac, 14, 0 );
+            x264_cabac_encode_decision( &h->cabac, 15, 0 );
+            x264_cabac_encode_decision( &h->cabac, 16, 1 );
+        }
+        else if( i_mb_type == I_4x4 )
+        {
+            /* prefix */
+            x264_cabac_encode_decision( &h->cabac, 14, 1 );
+            /* suffix: a single 0 bin */
+            x264_cabac_encode_decision( &h->cabac, 17, 0 );
+        }
+        else if( i_mb_type == I_PCM )
+        {
+            /* prefix */
+            x264_cabac_encode_decision( &h->cabac, 14, 1 );
+            /* suffix: a 1 bin followed by a terminal 1 */
+            x264_cabac_encode_decision( &h->cabac, 17, 1 );
+            x264_cabac_encode_terminal( &h->cabac, 1 ); /*ctxIdx == 276 */
+        }
+        else /* intra 16x16 (reached for every type not listed above) */
+        {
+            /* prefix */
+            x264_cabac_encode_decision( &h->cabac, 14, 1 );
+
+            /* suffix */
+            x264_cabac_encode_decision( &h->cabac, 17, 1 );
+            x264_cabac_encode_terminal( &h->cabac, 0 ); /*ctxIdx == 276 */
+            /* luma cbp flag on 17+1; both chroma cbp bins share 17+2 and both prediction mode bits share 17+3 */
+            x264_cabac_encode_decision( &h->cabac, 17+1, ( h->mb.i_cbp_luma == 0 ? 0 : 1 ));
+            if( h->mb.i_cbp_chroma == 0 )
+            {
+                x264_cabac_encode_decision( &h->cabac, 17+2, 0 );
+            }
+            else
+            {
+                x264_cabac_encode_decision( &h->cabac, 17+2, 1 );
+                x264_cabac_encode_decision( &h->cabac, 17+2, ( h->mb.i_cbp_chroma == 1 ? 0 : 1 ) );
+            }
+            x264_cabac_encode_decision( &h->cabac, 17+3, ( (h->mb.i_intra16x16_pred_mode / 2) ? 1 : 0 ));
+            x264_cabac_encode_decision( &h->cabac, 17+3, ( (h->mb.i_intra16x16_pred_mode % 2) ? 1 : 0 ));
+        }
+    }
+    else if( h->sh.i_type == SLICE_TYPE_B )
+    {
+        int ctx = 0;    /* counts the existing left/top neighbours that are neither B_SKIP nor B_DIRECT */
+        if( h->mb.i_mb_x > 0 && h->mb.type[h->mb.i_mb_xy - 1] != B_SKIP && h->mb.type[h->mb.i_mb_xy - 1] != B_DIRECT )
+        {
+            ctx++;
+        }
+        if( h->mb.i_mb_y > 0 && h->mb.type[h->mb.i_mb_xy - h->mb.i_mb_stride] != B_SKIP && h->mb.type[h->mb.i_mb_xy - h->mb.i_mb_stride] != B_DIRECT )
+        {
+            ctx++;
+        }
+
+        if( i_mb_type == B_DIRECT )
+        {
+            x264_cabac_encode_decision( &h->cabac, 27+ctx, 0 );
+        }
+        else if( i_mb_type == B_8x8 )
+        {
+            x264_cabac_encode_decision( &h->cabac, 27+ctx, 1 );
+            x264_cabac_encode_decision( &h->cabac, 27+3,   1 );
+            x264_cabac_encode_decision( &h->cabac, 27+4,   1 );
+            /* the three remaining bins all use context 27+5 */
+            x264_cabac_encode_decision( &h->cabac, 27+5,   1 );
+            x264_cabac_encode_decision( &h->cabac, 27+5,   1 );
+            x264_cabac_encode_decision( &h->cabac, 27+5,   1 );
+        }
+        else if( IS_INTRA( i_mb_type ) )
+        {
+            /* prefix */
+            x264_cabac_encode_decision( &h->cabac, 27+ctx, 1 );
+            x264_cabac_encode_decision( &h->cabac, 27+3,   1 );
+            x264_cabac_encode_decision( &h->cabac, 27+4,   1 );
+
+            x264_cabac_encode_decision( &h->cabac, 27+5,   1 );
+            x264_cabac_encode_decision( &h->cabac, 27+5,   0 );
+            x264_cabac_encode_decision( &h->cabac, 27+5,   1 );
+
+            /* Suffix */
+            if( i_mb_type == I_4x4 )
+            {
+                x264_cabac_encode_decision( &h->cabac, 32, 0 );
+            }
+            else if( i_mb_type == I_PCM )
+            {
+                x264_cabac_encode_decision( &h->cabac, 32, 1 );
+                x264_cabac_encode_terminal( &h->cabac,     1 );
+            }
+            else
+            {
+                x264_cabac_encode_decision( &h->cabac, 32, 1 );
+                x264_cabac_encode_terminal( &h->cabac,     0 );
+
+                /* TODO */
+                x264_cabac_encode_decision( &h->cabac, 32+1, ( h->mb.i_cbp_luma == 0 ? 0 : 1 ));
+                if( h->mb.i_cbp_chroma == 0 )
+                {
+                    x264_cabac_encode_decision( &h->cabac, 32+2, 0 );
+                }
+                else
+                {
+                    x264_cabac_encode_decision( &h->cabac, 32+2, 1 );
+                    x264_cabac_encode_decision( &h->cabac, 32+2, ( h->mb.i_cbp_chroma == 1 ? 0 : 1 ) );
+                }
+                x264_cabac_encode_decision( &h->cabac, 32+3, ( (h->mb.i_intra16x16_pred_mode / 2) ? 1 : 0 ));
+                x264_cabac_encode_decision( &h->cabac, 32+3, ( (h->mb.i_intra16x16_pred_mode % 2) ? 1 : 0 ));
+            }
+        }
+        else
+        {
+            static const int i_mb_len[21] =     /* number of bins used in the matching i_mb_bits row */
+            {
+                3, 6, 6,    /* L0 L0 */
+                3, 6, 6,    /* L1 L1 */
+                6, 7, 7,    /* BI BI */
+
+                6, 6,       /* L0 L1 */
+                6, 6,       /* L1 L0 */
+                7, 7,       /* L0 BI */
+                7, 7,       /* L1 BI */
+                7, 7,       /* BI L0 */
+                7, 7,       /* BI L1 */
+            };
+            static const int i_mb_bits[21][7] = /* bin string per (type, partition), rows in the same order as i_mb_len */
+            {
+                { 1, 0, 0, },            { 1, 1, 0, 0, 0, 1, },    { 1, 1, 0, 0, 1, 0, },   /* L0 L0 */
+                { 1, 0, 1, },            { 1, 1, 0, 0, 1, 1, },    { 1, 1, 0, 1, 0, 0, },   /* L1 L1 */
+                { 1, 1, 0, 0, 0, 0 ,},   { 1, 1, 1, 1, 0, 0 , 0 }, { 1, 1, 1, 1, 0, 0 , 1 },/* BI BI */
+
+                { 1, 1, 0, 1, 0, 1, },   { 1, 1, 0, 1, 1, 0, },     /* L0 L1 */
+                { 1, 1, 0, 1, 1, 1, },   { 1, 1, 1, 1, 1, 0, },     /* L1 L0 */
+                { 1, 1, 1, 0, 0, 0, 0 }, { 1, 1, 1, 0, 0, 0, 1 },   /* L0 BI */
+                { 1, 1, 1, 0, 0, 1, 0 }, { 1, 1, 1, 0, 0, 1, 1 },   /* L1 BI */
+                { 1, 1, 1, 0, 1, 0, 0 }, { 1, 1, 1, 0, 1, 0, 1 },   /* BI L0 */
+                { 1, 1, 1, 0, 1, 1, 0 }, { 1, 1, 1, 0, 1, 1, 1 }    /* BI L1 */
+            };
+
+            const int i_partition = h->mb.i_partition;
+            int idx = 0;    /* row of the two tables above, accumulated by the fall-through cases below */
+            int i;
+            switch( i_mb_type )
+            {
+                /* D_16x16, D_16x8, D_8x16 */
+                case B_BI_BI: idx += 3; /* fall through */
+                case B_L1_L1: idx += 3; /* fall through */
+                case B_L0_L0:
+                    if( i_partition == D_16x8 )
+                        idx += 1;
+                    else if( i_partition == D_8x16 )
+                        idx += 2;
+                    break;
+
+                /* D_16x8, D_8x16 */
+                case B_BI_L1: idx += 2; /* fall through */
+                case B_BI_L0: idx += 2; /* fall through */
+                case B_L1_BI: idx += 2; /* fall through */
+                case B_L0_BI: idx += 2; /* fall through */
+                case B_L1_L0: idx += 2; /* fall through */
+                case B_L0_L1:
+                    idx += 3*3;     /* skip the nine rows of the first group */
+                    if( i_partition == D_8x16 )
+                        idx++;
+                    break;
+                default:
+                    fprintf( stderr, "error in B mb type\n" );
+                    return;
+            }
+            /* bins 0 and 1 use 27+ctx and 27+3, bin 2 uses 27+4 when bin 1 is set and 27+5 otherwise, later bins use 27+5 */
+            x264_cabac_encode_decision( &h->cabac, 27+ctx,                         i_mb_bits[idx][0] );
+            x264_cabac_encode_decision( &h->cabac, 27+3,                           i_mb_bits[idx][1] );
+            x264_cabac_encode_decision( &h->cabac, 27+(i_mb_bits[idx][1] != 0 ? 4 : 5), i_mb_bits[idx][2] );
+            for( i = 3; i < i_mb_len[idx]; i++ )
+            {
+                x264_cabac_encode_decision( &h->cabac, 27+5,                       i_mb_bits[idx][i] );
+            }
+        }
+    }
+    else
+    {   /* NOTE(review): the message below names x264_macroblock_write_cabac, not this function */
+        fprintf( stderr, "unknown SLICE_TYPE unsupported in x264_macroblock_write_cabac\n" );
+    }
+}
+
+static void x264_cabac_mb_intra4x4_pred_mode( x264_t *h, int i_pred, int i_mode )
+{
+    /* Code the intra 4x4 prediction mode i_mode of one block relative to
+     * its predicted value i_pred.  Context 68 carries the "mode equals
+     * prediction" flag (b_prev_intra4x4_pred_mode). */
+    const int b_predicted = ( i_pred == i_mode );
+    int i_rem;
+    int i_bit;
+
+    x264_cabac_encode_decision( &h->cabac, 68, b_predicted );
+    if( b_predicted )
+        return;
+
+    /* The predicted value is excluded, so modes above it move down by one;
+     * the three low bits of the result follow, LSB first, all on context
+     * 69. */
+    i_rem = i_mode - ( i_mode > i_pred ? 1 : 0 );
+    for( i_bit = 0; i_bit < 3; i_bit++ )
+        x264_cabac_encode_decision( &h->cabac, 69, ( i_rem >> i_bit )&0x01 );
+}
+static void x264_cabac_mb_intra8x8_pred_mode( x264_t *h )
+{
+    /* Code h->mb.i_chroma_pred_mode as a unary string capped at three bins:
+     *   0 -> 0,   1 -> 1 0,   2 -> 1 1 0,   3 -> 1 1 1
+     * The first bin uses context 64 + (number of existing left/top
+     * neighbours whose stored chroma mode is non zero); every later bin
+     * uses context 64 + 3. */
+    const int i_xy   = h->mb.i_mb_xy;
+    const int i_mode = h->mb.i_chroma_pred_mode;
+    int i_inc = 0;
+
+    /* No need to test the neighbours for I4x4 or I_16x16, cache_save
+     * handles that */
+    if( h->mb.i_mb_x > 0 )
+        i_inc += ( h->mb.chroma_pred_mode[i_xy - 1] != 0 );
+    if( h->mb.i_mb_y > 0 )
+        i_inc += ( h->mb.chroma_pred_mode[i_xy - h->mb.i_mb_stride] != 0 );
+
+    x264_cabac_encode_decision( &h->cabac, 64 + i_inc, i_mode != 0 );
+    if( i_mode == 0 )
+        return;
+
+    /* second bin separates mode 1, third bin separates mode 2 */
+    x264_cabac_encode_decision( &h->cabac, 64 + 3, i_mode != 1 );
+    if( i_mode > 1 )
+    {
+        x264_cabac_encode_decision( &h->cabac, 64 + 3, i_mode != 2 );
+    }
+}
+
+static void x264_cabac_mb_cbp_luma( x264_t *h )
+{
+    /* One bin per 8x8 luma block: bit i8x8 of h->mb.i_cbp_luma, coded with
+     * context 73 + inc.  inc gains 1 when the 8x8 block to the left has a
+     * zero bit in its macroblock's stored cbp and 2 when the block above
+     * has one.  A neighbour that would lie in a macroblock left of the
+     * first column or above the first row contributes nothing.
+     * TODO: clean up and optimize */
+    const int i_cur_xy = h->mb.i_mb_xy;
+    int i8x8;
+
+    for( i8x8 = 0; i8x8 < 4; i8x8++ )
+    {
+        /* coordinates, in 4x4 blocks, of the first 4x4 block of this 8x8 */
+        const int bx = block_idx_x[4*i8x8];
+        const int by = block_idx_y[4*i8x8];
+
+        /* macroblock holding the left / upper 8x8 block: the current one
+         * when the neighbour is inside it, else the adjacent macroblock,
+         * or -1 when there is none */
+        const int i_left_xy = bx > 0 ? i_cur_xy :
+                              ( h->mb.i_mb_x > 0 ? i_cur_xy - 1 : -1 );
+        const int i_top_xy  = by > 0 ? i_cur_xy :
+                              ( h->mb.i_mb_y > 0 ? i_cur_xy - h->mb.i_mb_stride : -1 );
+        int i_inc = 0;
+
+        /* No need to test for PCM and SKIP */
+        if( i_left_xy >= 0 )
+        {
+            /* (bx-1)&0x03 wraps to column 3 of the left macroblock */
+            const int i_left8x8 = block_idx_xy[(bx-1)&0x03][by]/4;
+            if( !( ( h->mb.cbp[i_left_xy] >> i_left8x8 )&0x01 ) )
+                i_inc++;
+        }
+        if( i_top_xy >= 0 )
+        {
+            /* (by-1)&0x03 wraps to row 3 of the upper macroblock */
+            const int i_top8x8 = block_idx_xy[bx][(by-1)&0x03]/4;
+            if( !( ( h->mb.cbp[i_top_xy] >> i_top8x8 )&0x01 ) )
+                i_inc += 2;
+        }
+
+        x264_cabac_encode_decision( &h->cabac, 73 + i_inc, ( h->mb.i_cbp_luma >> i8x8 )&0x01 );
+    }
+}
+
+static void x264_cabac_mb_cbp_chroma( x264_t *h )
+{
+    /* The chroma cbp is sent as one or two bins:
+     *   bin 0: 1 when h->mb.i_cbp_chroma is non zero
+     *   bin 1: sent only after a 1, and is 1 when h->mb.i_cbp_chroma > 1
+     * Both contexts depend on bits 4-5 of the stored cbp of the left (a)
+     * and top (b) macroblocks; -1 stands for a neighbour that does not
+     * exist (current macroblock in the first column / first row).
+     *
+     * No need to test for SKIP/PCM */
+
+    const int i_xy  = h->mb.i_mb_xy;
+    const int cbp_a = h->mb.i_mb_x > 0 ?
+                      ( h->mb.cbp[i_xy - 1] >> 4 )&0x3 : -1;
+    const int cbp_b = h->mb.i_mb_y > 0 ?
+                      ( h->mb.cbp[i_xy - h->mb.i_mb_stride] >> 4 )&0x3 : -1;
+    const int b_coded = ( h->mb.i_cbp_chroma != 0 );
+    int i_inc;
+
+    /* bin 0: context 77 + inc, inc gaining 1 / 2 when the left / top
+     * value is greater than zero */
+    i_inc = ( cbp_a > 0 ? 1 : 0 ) + ( cbp_b > 0 ? 2 : 0 );
+    x264_cabac_encode_decision( &h->cabac, 77 + i_inc, b_coded );
+    if( !b_coded )
+    {
+        return;
+    }
+
+    /* bin 1: contexts start at 77 + 4, inc gaining 1 / 2 when the left /
+     * top value is exactly 2 */
+    i_inc = 4 + ( cbp_a == 2 ? 1 : 0 ) + ( cbp_b == 2 ? 2 : 0 );
+    x264_cabac_encode_decision( &h->cabac, 77 + i_inc,
+                                h->mb.i_cbp_chroma > 1 ? 1 : 0 );
+}
+
+/* Code the QP of the current macroblock as a delta against h->mb.i_last_qp.
+ * TODO check it with != qp per mb */
+static void x264_cabac_mb_qp_delta( x264_t *h )
+{
+    const int i_prev_xy = h->mb.i_mb_xy - 1;
+    const int i_delta   = h->mb.qp[h->mb.i_mb_xy] - h->mb.i_last_qp;
+    /* fold the signed delta: 0, +1, -1, +2, -2, ... -> 0, 1, 2, 3, 4, ... */
+    int i_ones = ( i_delta > 0 ) ? 2*i_delta - 1 : -2*i_delta;
+    int i_inc = 0;
+
+    /* First bin takes context 60 + 1 only when a previous macroblock exists,
+     * h->mb.i_last_dqp is non zero and that macroblock is I_16x16 or has a
+     * non zero cbp.  No need to test for PCM / SKIP */
+    if( i_prev_xy >= 0 && h->mb.i_last_dqp != 0 &&
+        ( h->mb.type[i_prev_xy] == I_16x16 || ( h->mb.cbp[i_prev_xy]&0x3f ) ) )
+        i_inc = 1;
+
+    /* unary string: i_ones ones and a closing zero; second bin on 60 + 2,
+     * all later bins on 60 + 3 */
+    for( ; i_ones > 0; i_ones-- )
+    {
+        x264_cabac_encode_decision( &h->cabac, 60 + i_inc, 1 );
+        i_inc = ( i_inc < 2 ) ? 2 : 3;
+    }
+    x264_cabac_encode_decision( &h->cabac, 60 + i_inc, 0 );
+}
+
+void x264_cabac_mb_skip( x264_t *h, int b_skip )
+{
+    /* Code the skip flag b_skip of the current macroblock.  The context
+     * base is 11 in P slices and 24 otherwise (SLICE_TYPE_B); it is raised
+     * by one for each existing left / top neighbour whose type is not a
+     * skip type. */
+    const int i_base = ( h->sh.i_type == SLICE_TYPE_P ) ? 11 : 24;
+    const int i_xy   = h->mb.i_mb_xy;
+    int i_inc = 0;
+
+    if( h->mb.i_mb_x > 0 && !IS_SKIP( h->mb.type[i_xy - 1] ) )
+        i_inc++;
+    if( h->mb.i_mb_y > 0 && !IS_SKIP( h->mb.type[i_xy - h->mb.i_mb_stride] ) )
+        i_inc++;
+
+    /* any non zero b_skip is sent as 1 */
+    x264_cabac_encode_decision( &h->cabac, i_base + i_inc, b_skip != 0 );
+}
+
+static inline void x264_cabac_mb_sub_p_partition( x264_t *h, int i_sub )
+{
+    /* Code the sub-partition type i_sub of one 8x8 block; bins 0, 1 and 2
+     * use contexts 21, 22 and 23.  Bin strings: 8x8 = 1, 8x4 = 0 0,
+     * 4x8 = 0 1 1, 4x4 = 0 1 0.  Any other value writes nothing. */
+    switch( i_sub )
+    {
+        case D_L0_8x8:
+            x264_cabac_encode_decision( &h->cabac, 21, 1 );
+            break;
+        case D_L0_8x4:
+            x264_cabac_encode_decision( &h->cabac, 21, 0 );
+            x264_cabac_encode_decision( &h->cabac, 22, 0 );
+            break;
+        case D_L0_4x8:
+        case D_L0_4x4:
+            x264_cabac_encode_decision( &h->cabac, 21, 0 );
+            x264_cabac_encode_decision( &h->cabac, 22, 1 );
+            x264_cabac_encode_decision( &h->cabac, 23, i_sub == D_L0_4x8 ? 1 : 0 );
+            break;
+        default:
+            break;
+    }
+}
+
+/* Write the sub-partition type of one 8x8 block of a B macroblock.
+ *
+ * Bin 0 is coded with context 36 and bin 1 with context 37.  The context of
+ * bin 2 depends on bin 1: 38 when bin 1 is 1, 39 when it is 0, so the
+ * three-bin codes of D_L0_8x8 / D_L1_8x8 end on context 39.  All later bins
+ * use context 39.  Any other i_sub value writes nothing. */
+static inline void x264_cabac_mb_sub_b_partition( x264_t *h, int i_sub )
+{
+    switch( i_sub )
+    {
+        case D_DIRECT_8x8:
+            x264_cabac_encode_decision( &h->cabac, 36, 0 );
+            break;
+        case D_L0_8x8:
+            /* bin 1 is 0: the last bin uses context 39, not 38 */
+            x264_cabac_encode_decision( &h->cabac, 36, 1 );
+            x264_cabac_encode_decision( &h->cabac, 37, 0 );
+            x264_cabac_encode_decision( &h->cabac, 39, 0 );
+            break;
+        case D_L1_8x8:
+            /* bin 1 is 0: the last bin uses context 39, not 38 */
+            x264_cabac_encode_decision( &h->cabac, 36, 1 );
+            x264_cabac_encode_decision( &h->cabac, 37, 0 );
+            x264_cabac_encode_decision( &h->cabac, 39, 1 );
+            break;
+        case D_BI_8x8:
+            x264_cabac_encode_decision( &h->cabac, 36, 1 );
+            x264_cabac_encode_decision( &h->cabac, 37, 1 );
+            x264_cabac_encode_decision( &h->cabac, 38, 0 );
+            x264_cabac_encode_decision( &h->cabac, 39, 0 );
+            x264_cabac_encode_decision( &h->cabac, 39, 0 );
+            break;
+        case D_L0_8x4:
+            x264_cabac_encode_decision( &h->cabac, 36, 1 );
+            x264_cabac_encode_decision( &h->cabac, 37, 1 );
+            x264_cabac_encode_decision( &h->cabac, 38, 0 );
+            x264_cabac_encode_decision( &h->cabac, 39, 0 );
+            x264_cabac_encode_decision( &h->cabac, 39, 1 );
+            break;
+        case D_L0_4x8:
+            x264_cabac_encode_decision( &h->cabac, 36, 1 );
+            x264_cabac_encode_decision( &h->cabac, 37, 1 );
+            x264_cabac_encode_decision( &h->cabac, 38, 0 );
+            x264_cabac_encode_decision( &h->cabac, 39, 1 );
+            x264_cabac_encode_decision( &h->cabac, 39, 0 );
+            break;
+        case D_L1_8x4:
+            x264_cabac_encode_decision( &h->cabac, 36, 1 );
+            x264_cabac_encode_decision( &h->cabac, 37, 1 );
+            x264_cabac_encode_decision( &h->cabac, 38, 0 );
+            x264_cabac_encode_decision( &h->cabac, 39, 1 );
+            x264_cabac_encode_decision( &h->cabac, 39, 1 );
+            break;
+        case D_L1_4x8:
+            x264_cabac_encode_decision( &h->cabac, 36, 1 );
+            x264_cabac_encode_decision( &h->cabac, 37, 1 );
+            x264_cabac_encode_decision( &h->cabac, 38, 1 );
+            x264_cabac_encode_decision( &h->cabac, 39, 0 );
+            x264_cabac_encode_decision( &h->cabac, 39, 0 );
+            x264_cabac_encode_decision( &h->cabac, 39, 0 );
+            break;
+        case D_BI_8x4:
+            x264_cabac_encode_decision( &h->cabac, 36, 1 );
+            x264_cabac_encode_decision( &h->cabac, 37, 1 );
+            x264_cabac_encode_decision( &h->cabac, 38, 1 );
+            x264_cabac_encode_decision( &h->cabac, 39, 0 );
+            x264_cabac_encode_decision( &h->cabac, 39, 0 );
+            x264_cabac_encode_decision( &h->cabac, 39, 1 );
+            break;
+        case D_BI_4x8:
+            x264_cabac_encode_decision( &h->cabac, 36, 1 );
+            x264_cabac_encode_decision( &h->cabac, 37, 1 );
+            x264_cabac_encode_decision( &h->cabac, 38, 1 );
+            x264_cabac_encode_decision( &h->cabac, 39, 0 );
+            x264_cabac_encode_decision( &h->cabac, 39, 1 );
+            x264_cabac_encode_decision( &h->cabac, 39, 0 );
+            break;
+        case D_L0_4x4:
+            x264_cabac_encode_decision( &h->cabac, 36, 1 );
+            x264_cabac_encode_decision( &h->cabac, 37, 1 );
+            x264_cabac_encode_decision( &h->cabac, 38, 1 );
+            x264_cabac_encode_decision( &h->cabac, 39, 0 );
+            x264_cabac_encode_decision( &h->cabac, 39, 1 );
+            x264_cabac_encode_decision( &h->cabac, 39, 1 );
+            break;
+        case D_L1_4x4:
+            x264_cabac_encode_decision( &h->cabac, 36, 1 );
+            x264_cabac_encode_decision( &h->cabac, 37, 1 );
+            x264_cabac_encode_decision( &h->cabac, 38, 1 );
+            x264_cabac_encode_decision( &h->cabac, 39, 1 );
+            x264_cabac_encode_decision( &h->cabac, 39, 0 );
+            break;
+        case D_BI_4x4:
+            x264_cabac_encode_decision( &h->cabac, 36, 1 );
+            x264_cabac_encode_decision( &h->cabac, 37, 1 );
+            x264_cabac_encode_decision( &h->cabac, 38, 1 );
+            x264_cabac_encode_decision( &h->cabac, 39, 1 );
+            x264_cabac_encode_decision( &h->cabac, 39, 1 );
+            break;
+        default:
+            break;
+    }
+}
+
+/* Write the reference index cached for block idx of list i_list as a unary
+ * code: one "1" bin per unit of the index, then a closing "0".  The first bin
+ * uses context 54 + 0..3, chosen from the cached references at scan8 offsets
+ * -1 (+1 if > 0) and -8 (+2 if > 0); the second bin uses 58, later ones 59. */
+static inline void x264_cabac_mb_ref( x264_t *h, int i_list, int idx )
+{
+    const int i_pos = x264_scan8[idx];
+    const int i_left = h->mb.cache.ref[i_list][i_pos - 1];
+    const int i_top  = h->mb.cache.ref[i_list][i_pos - 8];
+    const int i_ref  = h->mb.cache.ref[i_list][i_pos];
+    int i_ctx = 0;
+    int i_bin;
+
+    if( i_left > 0 )
+        i_ctx += 1;
+    if( i_top > 0 )
+        i_ctx += 2;
+
+    for( i_bin = 0; i_bin < i_ref; i_bin++ )
+    {
+        x264_cabac_encode_decision( &h->cabac, 54 + i_ctx, 1 );
+        i_ctx = i_ctx < 4 ? 4 : 5;
+    }
+    x264_cabac_encode_decision( &h->cabac, 54 + i_ctx, 0 );
+}
+
+
+
+/* Write one component of a motion vector difference (l selects the
+ * component and its context set: 40.. for l == 0, 47.. otherwise).
+ * The value is coded as a truncated unary prefix of at most 9 context-coded
+ * bins, an order-3 Exp-Golomb bypass suffix once |mvd| >= 9, and a bypass
+ * sign bit for non-zero values. */
+static inline void  x264_cabac_mb_mvd_cpn( x264_t *h, int i_list, int idx, int l, int mvd )
+{
+    const int i_pos = x264_scan8[idx];
+    /* summed magnitude of the same component at scan8 offsets -1 and -8 */
+    const int i_sum = abs( h->mb.cache.mvd[i_list][i_pos - 1][l] ) +
+                      abs( h->mb.cache.mvd[i_list][i_pos - 8][l] );
+    const int i_mag = abs( mvd );
+    const int i_base = l == 0 ? 40 : 47;
+    int i_ctx;
+    int i_bin;
+
+    /* context of the first bin, picked from the neighbour sum */
+    if( i_sum > 32 )
+        i_ctx = 2;
+    else
+        i_ctx = i_sum < 3 ? 0 : 1;
+
+    /* prefix: contexts advance 3, 4, 5, 6 and then stay at 6 */
+    for( i_bin = 0; i_bin < 9 && i_bin < i_mag; i_bin++ )
+    {
+        x264_cabac_encode_decision( &h->cabac, i_base + i_ctx, 1 );
+        if( i_ctx < 3 )
+            i_ctx = 3;
+        else if( i_ctx < 6 )
+            i_ctx++;
+    }
+    if( i_mag < 9 )
+    {
+        x264_cabac_encode_decision( &h->cabac, i_base + i_ctx, 0 );
+    }
+    else
+    {
+        int i_rest = i_mag - 9;
+        int i_order = 3;
+
+        /* suffix: the prefix was truncated, the remainder goes in bypass */
+        for( ; i_rest >= (1 << i_order); i_order++ )
+        {
+            x264_cabac_encode_bypass( &h->cabac, 1 );
+            i_rest -= 1 << i_order;
+        }
+        x264_cabac_encode_bypass( &h->cabac, 0 );
+        while( i_order-- > 0 )
+            x264_cabac_encode_bypass( &h->cabac, (i_rest >> i_order)&0x01 );
+    }
+
+    /* sign, only present for a non-zero difference */
+    if( mvd != 0 )
+        x264_cabac_encode_bypass( &h->cabac, mvd < 0 ? 1 : 0 );
+}
+
+/* Write the motion vector difference of the partition starting at block idx
+ * (width x height as passed by the caller) and store it in the mvd cache. */
+static inline void  x264_cabac_mb_mvd( x264_t *h, int i_list, int idx, int width, int height )
+{
+    const int i_pos = x264_scan8[idx];
+    int pred[2];
+    int dx, dy;
+
+    /* difference between the cached vector and its prediction */
+    x264_mb_predict_mv( h, i_list, idx, width, pred );
+    dx = h->mb.cache.mv[i_list][i_pos][0] - pred[0];
+    dy = h->mb.cache.mv[i_list][i_pos][1] - pred[1];
+
+    x264_cabac_mb_mvd_cpn( h, i_list, idx, 0, dx );
+    x264_cabac_mb_mvd_cpn( h, i_list, idx, 1, dy );
+    x264_macroblock_cache_mvd( h, block_idx_x[idx], block_idx_y[idx], width, height, i_list, dx, dy );
+}
+
+static int x264_cabac_mb_cbf_ctxidxinc( x264_t *h, int i_cat, int i_idx )
+{
+    /* Returns 4 * i_cat, plus 1 / 2 when the left / top neighbour block counts as coded. */
+    int i_mba_xy = -1;  /* macroblock holding the left neighbour, -1 if none */
+    int i_mbb_xy = -1;  /* macroblock holding the top neighbour, -1 if none */
+    int i_nza = -1;     /* > 0 when the left neighbour has coefficients */
+    int i_nzb = -1;     /* > 0 when the top neighbour has coefficients */
+    int ctx = 0;
+    /* TODO: clean up/optimize */
+    if( i_cat == 0 )    /* DC 16x16: only I_16x16 neighbours count, via cbp bit 0x100 */
+    {
+        if( h->mb.i_mb_x > 0 )
+        {
+            i_mba_xy = h->mb.i_mb_xy -1;
+            if( h->mb.type[i_mba_xy] == I_16x16 )
+            {
+                i_nza = h->mb.cbp[i_mba_xy]&0x100;
+            }
+        }
+        if( h->mb.i_mb_y > 0 )
+        {
+            i_mbb_xy = h->mb.i_mb_xy - h->mb.i_mb_stride;
+            if( h->mb.type[i_mbb_xy] == I_16x16 )
+            {
+                i_nzb = h->mb.cbp[i_mbb_xy]&0x100;
+            }
+        }
+    }
+    else if( i_cat == 1 || i_cat == 2 )     /* AC 16x16 / Luma4x4: i_idx is a luma 4x4 block */
+    {
+        int x = block_idx_x[i_idx];
+        int y = block_idx_y[i_idx];
+
+        if( x > 0 )     /* the left block lies inside the current macroblock */
+            i_mba_xy = h->mb.i_mb_xy;
+        else if( h->mb.i_mb_x > 0 )
+            i_mba_xy = h->mb.i_mb_xy -1;
+
+        if( y > 0 )     /* the top block lies inside the current macroblock */
+            i_mbb_xy = h->mb.i_mb_xy;
+        else if( h->mb.i_mb_y > 0 )
+            i_mbb_xy = h->mb.i_mb_xy - h->mb.i_mb_stride;
+
+        /* no need to test for skip/pcm */
+        if( i_mba_xy >= 0 )
+        {
+            const int i8x8a = block_idx_xy[(x-1)&0x03][y]/4;    /* 8x8 block of the left neighbour, x wraps into the left MB */
+            if( (h->mb.cbp[i_mba_xy]&0x0f)>> i8x8a )    /* true when luma cbp bit i8x8a or any higher one is set */
+            {
+                i_nza = h->mb.cache.non_zero_count[x264_scan8[i_idx] - 1];
+            }
+        }
+        if( i_mbb_xy >= 0 )
+        {
+            const int i8x8b = block_idx_xy[x][(y-1)&0x03]/4;    /* 8x8 block of the top neighbour, y wraps into the top MB */
+            if( (h->mb.cbp[i_mbb_xy]&0x0f)>> i8x8b )    /* true when luma cbp bit i8x8b or any higher one is set */
+            {
+                i_nzb = h->mb.cache.non_zero_count[x264_scan8[i_idx] - 8];
+            }
+        }
+    }
+    else if( i_cat == 3 )   /* DC Chroma: tests bit 9 + i_idx of the neighbour's cbp */
+    {
+        /* no need to test skip/pcm */
+        if( h->mb.i_mb_x > 0 )
+        {
+            i_mba_xy = h->mb.i_mb_xy -1;
+            if( h->mb.cbp[i_mba_xy]&0x30 )
+            {
+                i_nza = h->mb.cbp[i_mba_xy]&( 0x02 << ( 8 + i_idx) );
+            }
+        }
+        if( h->mb.i_mb_y > 0 )
+        {
+            i_mbb_xy = h->mb.i_mb_xy - h->mb.i_mb_stride;
+            if( h->mb.cbp[i_mbb_xy]&0x30 )
+            {
+                i_nzb = h->mb.cbp[i_mbb_xy]&( 0x02 << ( 8 + i_idx) );
+            }
+        }
+    }
+    else if( i_cat == 4 )   /* AC Chroma: i_idx = 4 * iCbCr + chroma4x4idx */
+    {
+        int idxc = i_idx% 4;
+
+        if( idxc == 1 || idxc == 3 )    /* right column: the left block is in this macroblock */
+            i_mba_xy = h->mb.i_mb_xy;
+        else if( h->mb.i_mb_x > 0 )
+            i_mba_xy = h->mb.i_mb_xy - 1;
+
+        if( idxc == 2 || idxc == 3 )    /* bottom row: the top block is in this macroblock */
+            i_mbb_xy = h->mb.i_mb_xy;
+        else if( h->mb.i_mb_y > 0 )
+            i_mbb_xy = h->mb.i_mb_xy - h->mb.i_mb_stride;
+
+        /* no need to test skip/pcm */
+        if( i_mba_xy >= 0 && (h->mb.cbp[i_mba_xy]&0x30) == 0x20 )
+        {
+            i_nza = h->mb.cache.non_zero_count[x264_scan8[16+i_idx] - 1];
+        }
+        if( i_mbb_xy >= 0 && (h->mb.cbp[i_mbb_xy]&0x30) == 0x20 )
+        {
+            i_nzb = h->mb.cache.non_zero_count[x264_scan8[16+i_idx] - 8];
+        }
+    }
+    /* a missing neighbour counts as coded when the current macroblock is intra */
+    if( ( i_mba_xy < 0  && IS_INTRA( h->mb.i_type ) ) || i_nza > 0 )
+    {
+        ctx++;
+    }
+    if( ( i_mbb_xy < 0  && IS_INTRA( h->mb.i_type ) ) || i_nzb > 0 )
+    {
+        ctx += 2;
+    }
+
+    return 4 * i_cat + ctx;
+}
+
+
+/* Write one residual block with CABAC.
+ *
+ * i_ctxBlockCat selects the context set and the meaning of i_idx:
+ *      0 -> DC 16x16   i_idx = 0
+ *      1 -> AC 16x16   i_idx = luma4x4idx
+ *      2 -> Luma4x4    i_idx = luma4x4idx
+ *      3 -> DC Chroma  i_idx = iCbCr
+ *      4 -> AC Chroma  i_idx = 4 * iCbCr + chroma4x4idx
+ * l points to the i_count coefficients of the block (the local buffers
+ * hold at most 16 of them).
+ *
+ * The coded block flag is written first.  A block with at least one
+ * non-zero coefficient is followed by its significance map and then by the
+ * magnitude and sign of each non-zero coefficient, last one first. */
+static void block_residual_write_cabac( x264_t *h, int i_ctxBlockCat, int i_idx, int *l, int i_count )
+{
+    static const int significant_coeff_flag_offset[5] = { 0, 15, 29, 44, 47 };
+    static const int last_significant_coeff_flag_offset[5] = { 0, 15, 29, 44, 47 };
+    static const int coeff_abs_level_m1_offset[5] = { 0, 10, 20, 30, 39 };
+
+    int i_abs_m1[16];       /* |coefficient| - 1 of each non-zero coefficient */
+    int b_negative[16];     /* 1 when that coefficient is negative */
+    int i_nz = 0;           /* number of non-zero coefficients */
+    int i_last_nz = 0;      /* position in l[] of the last non-zero one */
+    int i_eq1 = 0;          /* magnitudes equal to 1 written so far */
+    int i_gt1 = 0;          /* magnitudes above 1 written so far */
+    int i_ctx_sig, i_ctx_last, i_ctx_level;
+    int i_pos, n;
+
+    /* collect the non-zero coefficients */
+    for( i_pos = 0; i_pos < i_count; i_pos++ )
+    {
+        if( l[i_pos] == 0 )
+            continue;
+
+        i_abs_m1[i_nz]   = abs( l[i_pos] ) - 1;
+        b_negative[i_nz] = l[i_pos] < 0 ? 1 : 0;
+        i_nz++;
+        i_last_nz = i_pos;
+    }
+
+    /* coded block flag */
+    x264_cabac_encode_decision( &h->cabac,  85 + x264_cabac_mb_cbf_ctxidxinc( h, i_ctxBlockCat, i_idx ), i_nz == 0 ? 0 : 1 );
+    if( i_nz == 0 )
+        return;
+
+    i_ctx_sig   = 105 + significant_coeff_flag_offset[i_ctxBlockCat];
+    i_ctx_last  = 166 + last_significant_coeff_flag_offset[i_ctxBlockCat];
+    i_ctx_level = 227 + coeff_abs_level_m1_offset[i_ctxBlockCat];
+
+    /* significance map: one flag per position up to the last non-zero
+     * coefficient, the final position of the block is never signalled */
+    for( i_pos = 0; i_pos < i_count - 1; i_pos++ )
+    {
+        /* the context increment is the position itself */
+        if( l[i_pos] != 0 )
+        {
+            x264_cabac_encode_decision( &h->cabac, i_ctx_sig + i_pos, 1 );
+            x264_cabac_encode_decision( &h->cabac, i_ctx_last + i_pos, i_pos == i_last_nz ? 1 : 0 );
+        }
+        else
+        {
+            x264_cabac_encode_decision( &h->cabac, i_ctx_sig + i_pos, 0 );
+        }
+
+        if( i_pos == i_last_nz )
+        {
+            break;
+        }
+    }
+
+    /* levels, in reverse order */
+    for( n = i_nz - 1; n >= 0; n-- )
+    {
+        const int i_level_m1 = i_abs_m1[n];
+        const int i_prefix = X264_MIN( i_level_m1, 14 );
+        int i_ctx;
+
+        /* first prefix bin: is the magnitude above 1 ? */
+        if( i_gt1 != 0 )
+            i_ctx = i_ctx_level;
+        else
+            i_ctx = i_ctx_level + X264_MIN( 4, i_eq1 + 1 );
+        x264_cabac_encode_decision( &h->cabac, i_ctx, i_prefix == 0 ? 0 : 1 );
+
+        if( i_prefix > 0 )
+        {
+            /* remaining prefix bins share one context; the closing 0 is
+             * dropped when the prefix reaches its maximum of 14 */
+            int i_bin;
+
+            i_ctx = i_ctx_level + 5 + X264_MIN( 4, i_gt1 );
+            for( i_bin = 1; i_bin < i_prefix; i_bin++ )
+            {
+                x264_cabac_encode_decision( &h->cabac, i_ctx, 1 );
+            }
+            if( i_prefix < 14 )
+            {
+                x264_cabac_encode_decision( &h->cabac, i_ctx, 0 );
+            }
+        }
+
+        /* suffix: order-0 Exp-Golomb in bypass mode */
+        if( i_level_m1 >= 14 )
+        {
+            int i_rest = i_level_m1 - 14;
+            int i_order = 0;
+
+            for( ; i_rest >= (1 << i_order); i_order++ )
+            {
+                x264_cabac_encode_bypass( &h->cabac, 1 );
+                i_rest -= 1 << i_order;
+            }
+            x264_cabac_encode_bypass( &h->cabac, 0 );
+            while( i_order-- > 0 )
+            {
+                x264_cabac_encode_bypass( &h->cabac, (i_rest >> i_order)&0x01 );
+            }
+        }
+
+        /* sign */
+        x264_cabac_encode_bypass( &h->cabac, b_negative[n] );
+
+        /* update the statistics that drive the next context choice */
+        if( i_level_m1 == 0 )
+            i_eq1++;
+        else
+            i_gt1++;
+    }
+}
+
+/* Write the current macroblock with CABAC: its type, then the prediction
+ * modes or references and motion vectors, then cbp, qp delta and residual. */
+void x264_macroblock_write_cabac( x264_t *h, bs_t *s )
+{
+    const int i_mb_type = h->mb.i_type;
+    int i;
+
+    /* Write the MB type */
+    x264_cabac_mb_type( h );
+
+    /* PCM special block type UNTESTED */
+    if( i_mb_type == I_PCM )
+    {
+        bs_align_0( s );    /* not sure */
+        /* Luma */
+        for( i = 0; i < 16*16; i++ )
+        {
+            const int x = 16 * h->mb.i_mb_x + (i % 16);
+            const int y = 16 * h->mb.i_mb_y + (i / 16);
+            bs_write( s, 8, h->fenc->plane[0][y*h->mb.pic.i_stride[0]+x] );
+        }
+        /* Cb */
+        for( i = 0; i < 8*8; i++ )
+        {
+            const int x = 8 * h->mb.i_mb_x + (i % 8);
+            const int y = 8 * h->mb.i_mb_y + (i / 8);
+            bs_write( s, 8, h->fenc->plane[1][y*h->mb.pic.i_stride[1]+x] );
+        }
+        /* Cr */
+        for( i = 0; i < 8*8; i++ )
+        {
+            const int x = 8 * h->mb.i_mb_x + (i % 8);
+            const int y = 8 * h->mb.i_mb_y + (i / 8);
+            bs_write( s, 8, h->fenc->plane[2][y*h->mb.pic.i_stride[2]+x] );
+        }
+        x264_cabac_encode_init( &h->cabac, s );  /* re-initialise the CABAC encoder on s after the raw samples */
+        return;
+    }
+
+    if( IS_INTRA( i_mb_type ) )
+    {
+        /* Prediction */
+        if( i_mb_type == I_4x4 )
+        {
+            for( i = 0; i < 16; i++ )
+            {
+                const int i_pred = x264_mb_predict_intra4x4_mode( h, i );
+                const int i_mode = h->mb.cache.intra4x4_pred_mode[x264_scan8[i]];
+                x264_cabac_mb_intra4x4_pred_mode( h, i_pred, i_mode );
+            }
+        }
+        x264_cabac_mb_intra8x8_pred_mode( h );
+    }
+    else if( i_mb_type == P_L0 )
+    {
+        /* all references of the macroblock are written before its vectors */
+        if( h->mb.i_partition == D_16x16 )
+        {
+            if( h->sh.i_num_ref_idx_l0_active > 1 )
+            {
+                x264_cabac_mb_ref( h, 0, 0 );
+            }
+            x264_cabac_mb_mvd( h, 0, 0, 4, 4 );
+        }
+        else if( h->mb.i_partition == D_16x8 )
+        {
+            if( h->sh.i_num_ref_idx_l0_active > 1 )
+            {
+                x264_cabac_mb_ref( h, 0, 0 );
+                x264_cabac_mb_ref( h, 0, 8 );
+            }
+            x264_cabac_mb_mvd( h, 0, 0, 4, 2 );
+            x264_cabac_mb_mvd( h, 0, 8, 4, 2 );
+        }
+        else if( h->mb.i_partition == D_8x16 )
+        {
+            if( h->sh.i_num_ref_idx_l0_active > 1 )
+            {
+                x264_cabac_mb_ref( h, 0, 0 );
+                x264_cabac_mb_ref( h, 0, 4 );
+            }
+            x264_cabac_mb_mvd( h, 0, 0, 2, 4 );
+            x264_cabac_mb_mvd( h, 0, 4, 2, 4 );
+        }
+    }
+    else if( i_mb_type == P_8x8 )
+    {
+        /* sub mb type */
+        x264_cabac_mb_sub_p_partition( h, h->mb.i_sub_partition[0] );
+        x264_cabac_mb_sub_p_partition( h, h->mb.i_sub_partition[1] );
+        x264_cabac_mb_sub_p_partition( h, h->mb.i_sub_partition[2] );
+        x264_cabac_mb_sub_p_partition( h, h->mb.i_sub_partition[3] );
+
+        /* ref 0 */
+        if( h->sh.i_num_ref_idx_l0_active > 1 )
+        {
+            x264_cabac_mb_ref( h, 0, 0 );
+            x264_cabac_mb_ref( h, 0, 4 );
+            x264_cabac_mb_ref( h, 0, 8 );
+            x264_cabac_mb_ref( h, 0, 12 );
+        }
+
+        for( i = 0; i < 4; i++ )
+        {
+            switch( h->mb.i_sub_partition[i] )
+            {
+                case D_L0_8x8:
+                    x264_cabac_mb_mvd( h, 0, 4*i, 2, 2 );
+                    break;
+                case D_L0_8x4:
+                    x264_cabac_mb_mvd( h, 0, 4*i+0, 2, 1 );
+                    x264_cabac_mb_mvd( h, 0, 4*i+2, 2, 1 );
+                    break;
+                case D_L0_4x8:
+                    x264_cabac_mb_mvd( h, 0, 4*i+0, 1, 2 );
+                    x264_cabac_mb_mvd( h, 0, 4*i+1, 1, 2 );
+                    break;
+                case D_L0_4x4:
+                    x264_cabac_mb_mvd( h, 0, 4*i+0, 1, 1 );
+                    x264_cabac_mb_mvd( h, 0, 4*i+1, 1, 1 );
+                    x264_cabac_mb_mvd( h, 0, 4*i+2, 1, 1 );
+                    x264_cabac_mb_mvd( h, 0, 4*i+3, 1, 1 );
+                    break;
+            }
+        }
+    }
+    else if( i_mb_type == B_8x8 )
+    {
+        /* TODO: B_8x8 is not written yet, the macroblock is left incomplete */
+        fprintf( stderr, "Arggg B_8x8\n" );
+        return;
+    }
+    else if( i_mb_type != B_DIRECT )
+    {
+        /* All B mode */
+        int i_list;
+        int b_list[2][2];   /* [list][partition]: non-zero when the partition uses that list */
+
+        /* init ref list utilisations */
+        for( i = 0; i < 2; i++ )
+        {
+            b_list[0][i] = x264_mb_type_list0_table[i_mb_type][i];
+            b_list[1][i] = x264_mb_type_list1_table[i_mb_type][i];
+        }
+
+        for( i_list = 0; i_list < 2; i_list++ )
+        {
+            const int i_ref_max = i_list == 0 ? h->sh.i_num_ref_idx_l0_active : h->sh.i_num_ref_idx_l1_active;
+
+            if( i_ref_max > 1 )
+            {
+                if( h->mb.i_partition == D_16x16 )
+                {
+                    if( b_list[i_list][0] ) x264_cabac_mb_ref( h, i_list, 0 );
+                }
+                else if( h->mb.i_partition == D_16x8 )
+                {
+                    if( b_list[i_list][0] ) x264_cabac_mb_ref( h, i_list, 0 );
+                    if( b_list[i_list][1] ) x264_cabac_mb_ref( h, i_list, 8 );
+                }
+                else if( h->mb.i_partition == D_8x16 )
+                {
+                    if( b_list[i_list][0] ) x264_cabac_mb_ref( h, i_list, 0 );
+                    if( b_list[i_list][1] ) x264_cabac_mb_ref( h, i_list, 4 );
+                }
+            }
+        }
+        for( i_list = 0; i_list < 2; i_list++ )
+        {
+            if( h->mb.i_partition == D_16x16 )
+            {
+                if( b_list[i_list][0] ) x264_cabac_mb_mvd( h, i_list, 0, 4, 4 );
+            }
+            else if( h->mb.i_partition == D_16x8 )
+            {
+                if( b_list[i_list][0] ) x264_cabac_mb_mvd( h, i_list, 0, 4, 2 );
+                if( b_list[i_list][1] ) x264_cabac_mb_mvd( h, i_list, 8, 4, 2 );
+            }
+            else if( h->mb.i_partition == D_8x16 )
+            {
+                if( b_list[i_list][0] ) x264_cabac_mb_mvd( h, i_list, 0, 2, 4 );
+                if( b_list[i_list][1] ) x264_cabac_mb_mvd( h, i_list, 4, 2, 4 );
+            }
+        }
+    }
+
+    if( i_mb_type != I_16x16 )    /* no separate cbp is written here for I_16x16 */
+    {
+        x264_cabac_mb_cbp_luma( h );
+        x264_cabac_mb_cbp_chroma( h );
+    }
+
+    if( h->mb.i_cbp_luma > 0 || h->mb.i_cbp_chroma > 0 || i_mb_type == I_16x16 )
+    {
+        x264_cabac_mb_qp_delta( h );
+
+        /* write residual */
+        if( i_mb_type == I_16x16 )
+        {
+            /* DC Luma */
+            block_residual_write_cabac( h, 0, 0, h->dct.luma16x16_dc, 16 );
+
+            if( h->mb.i_cbp_luma != 0 )
+            {
+                /* AC Luma */
+                for( i = 0; i < 16; i++ )
+                {
+                    block_residual_write_cabac( h, 1, i, h->dct.block[i].residual_ac, 15 );
+                }
+            }
+        }
+        else
+        {
+            for( i = 0; i < 16; i++ )
+            {
+                if( h->mb.i_cbp_luma & ( 1 << ( i / 4 ) ) )    /* one cbp bit per group of four 4x4 blocks */
+                {
+                    block_residual_write_cabac( h, 2, i, h->dct.block[i].luma4x4, 16 );
+                }
+            }
+        }
+
+        if( h->mb.i_cbp_chroma &0x03 )    /* Chroma DC residual present */
+        {
+            block_residual_write_cabac( h, 3, 0, h->dct.chroma_dc[0], 4 );
+            block_residual_write_cabac( h, 3, 1, h->dct.chroma_dc[1], 4 );
+        }
+        if( h->mb.i_cbp_chroma&0x02 ) /* Chroma AC residual present */
+        {
+            for( i = 0; i < 8; i++ )
+            {
+                block_residual_write_cabac( h, 4, i, h->dct.block[16+i].residual_ac, 15 );
+            }
+        }
+    }
+}
+
diff --git a/encoder/cavlc.c b/encoder/cavlc.c
new file mode 100644
index 00000000..de7ba547
--- /dev/null
+++ b/encoder/cavlc.c
@@ -0,0 +1,688 @@
+/*****************************************************************************
+ * cavlc.c: h264 encoder library
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: cavlc.c,v 1.1 2004/06/03 19:27:08 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdint.h>
+
+#include "../core/common.h"
+#include "../core/vlc.h"
+#include "macroblock.h"
+
+static const uint8_t intra4x4_cbp_to_golomb[48]=    /* by its name: cbp -> Exp-Golomb code number, I_4x4 macroblocks */
+{
+  3, 29, 30, 17, 31, 18, 37,  8, 32, 38, 19,  9, 20, 10, 11,  2,
+ 16, 33, 34, 21, 35, 22, 39,  4, 36, 40, 23,  5, 24,  6,  7,  1,
+ 41, 42, 43, 25, 44, 26, 46, 12, 45, 47, 27, 13, 28, 14, 15,  0
+};
+static const uint8_t inter_cbp_to_golomb[48]=   /* by its name: cbp -> Exp-Golomb code number, inter macroblocks */
+{
+  0,  2,  3,  7,  4,  8, 17, 13,  5, 18,  9, 14, 10, 15, 16, 11,
+  1, 32, 33, 36, 34, 37, 44, 40, 35, 45, 38, 41, 39, 42, 43, 19,
+  6, 24, 25, 20, 26, 21, 46, 28, 27, 47, 22, 29, 23, 30, 31, 12
+};
+
+static const uint8_t block_idx_x[16] =  /* column (0..3) of each 4x4 block index */
+{
+    0, 1, 0, 1, 2, 3, 2, 3, 0, 1, 0, 1, 2, 3, 2, 3
+};
+static const uint8_t block_idx_y[16] =  /* row (0..3) of each 4x4 block index */
+{
+    0, 0, 1, 1, 0, 0, 1, 1, 2, 2, 3, 3, 2, 2, 3, 3
+};
+static const uint8_t block_idx_xy[4][4] =   /* inverse of the two tables above: [column][row] -> block index */
+{
+    { 0, 2, 8,  10},
+    { 1, 3, 9,  11},
+    { 4, 6, 12, 14},
+    { 5, 7, 13, 15}
+};
+
+#define BLOCK_INDEX_CHROMA_DC   (-1)    /* i_idx value selecting the chroma DC tables in block_residual_write_cavlc */
+#define BLOCK_INDEX_LUMA_DC     (-2)    /* i_idx value for luma DC: its nC is predicted from block 0 */
+
+static inline void bs_write_vlc( bs_t *s, vlc_t v )
+{
+    bs_write( s, v.i_size, v.i_bits );  /* write the code word v.i_bits on v.i_size bits */
+}
+
+/****************************************************************************
+ * block_residual_write_cavlc: code the i_count coefficients l[] of one block
+ ****************************************************************************/
+static void block_residual_write_cavlc( x264_t *h, bs_t *s, int i_idx, int *l, int i_count )
+{
+    int level[16], run[16];     /* non-zero levels, last one first, and the zero run below each */
+    int i_total, i_trailing;    /* non-zero coefficients, and how many of them are trailing +-1 (max 3) */
+    int i_total_zero;           /* zeros located below the last non-zero coefficient */
+    int i_last;
+    unsigned int i_sign;        /* sign bits of the trailing ones, 1 = negative */
+
+    int i;
+    int i_zero_left;
+    int i_suffix_length;
+
+    /* first find i_last */
+    i_last = i_count - 1;
+    while( i_last >= 0 && l[i_last] == 0 )
+    {
+        i_last--;
+    }
+
+    i_sign = 0;
+    i_total = 0;
+    i_trailing = 0;
+    i_total_zero = 0;
+
+    if( i_last >= 0 )
+    {
+        int b_trailing = 1;     /* cleared by the first level that is not a trailing one */
+        int idx = 0;
+
+        /* level and run and total */
+        while( i_last >= 0 )
+        {
+            level[idx] = l[i_last--];
+
+            run[idx] = 0;
+            while( i_last >= 0 && l[i_last] == 0 )
+            {
+                run[idx]++;
+                i_last--;
+            }
+
+            i_total++;
+            i_total_zero += run[idx];
+
+            if( b_trailing && abs( level[idx] ) == 1 && i_trailing < 3 )
+            {
+                i_sign <<= 1;
+                if( level[idx] < 0 )
+                {
+                    i_sign |= 0x01;
+                }
+
+                i_trailing++;
+            }
+            else
+            {
+                b_trailing = 0;
+            }
+
+            idx++;
+        }
+    }
+
+    /* total/trailing */
+    if( i_idx == BLOCK_INDEX_CHROMA_DC )
+    {
+        bs_write_vlc( s, x264_coeff_token[4][i_total*4+i_trailing] );
+    }
+    else
+    {
+        /* x264_mb_predict_non_zero_code returns a value in 0 .. (16+16+1)>>1 = 16 */
+        static const int ct_index[17] = {0,0,1,1,2,2,2,2,3,3,3,3,3,3,3,3,3 };
+        int nC;
+
+        if( i_idx == BLOCK_INDEX_LUMA_DC )
+        {
+            nC = x264_mb_predict_non_zero_code( h, 0 );
+        }
+        else
+        {
+            nC = x264_mb_predict_non_zero_code( h, i_idx );
+        }
+
+        bs_write_vlc( s, x264_coeff_token[ct_index[nC]][i_total*4+i_trailing] );
+    }
+
+    if( i_total <= 0 )
+    {
+        return;
+    }
+    /* signs of the trailing ones, then the remaining levels */
+    i_suffix_length = i_total > 10 && i_trailing < 3 ? 1 : 0;
+    if( i_trailing > 0 )
+    {
+        bs_write( s, i_trailing, i_sign );
+    }
+    for( i = i_trailing; i < i_total; i++ )
+    {
+        int i_level_code;
+
+        /* calculate level code */
+        if( level[i] < 0 )
+        {
+            i_level_code = -2*level[i] - 1;
+        }
+        else /* if( level[i] > 0 ) */
+        {
+            i_level_code = 2 * level[i] - 2;
+        }
+        if( i == i_trailing && i_trailing < 3 )
+        {
+            i_level_code -=2; /* as level[i] can't be 1 for the first one if i_trailing < 3 */
+        }
+
+        if( ( i_level_code >> i_suffix_length ) < 14 )
+        {
+            bs_write_vlc( s, x264_level_prefix[i_level_code >> i_suffix_length] );
+            if( i_suffix_length > 0 )
+            {
+                bs_write( s, i_suffix_length, i_level_code );
+            }
+        }
+        else if( i_suffix_length == 0 && i_level_code < 30 )
+        {
+            bs_write_vlc( s, x264_level_prefix[14] );
+            bs_write( s, 4, i_level_code - 14 );
+        }
+        else if( i_suffix_length > 0 && ( i_level_code >> i_suffix_length ) == 14 )
+        {
+            bs_write_vlc( s, x264_level_prefix[14] );
+            bs_write( s, i_suffix_length, i_level_code );
+        }
+        else
+        {
+            /* escape: prefix 15 followed by a 12 bit suffix */
+            bs_write_vlc( s, x264_level_prefix[15] );
+            i_level_code -= 15 << i_suffix_length;
+            if( i_suffix_length == 0 )
+            {
+                i_level_code -= 15;
+            }
+
+            if( i_level_code >= ( 1 << 12 ) || i_level_code < 0 )
+            {
+                fprintf( stderr, "OVERFLOW levelcode=%d\n", i_level_code );
+            }
+
+            bs_write( s, 12, i_level_code );    /* check overflow ?? */
+        }
+
+        if( i_suffix_length == 0 )
+        {
+            i_suffix_length++;
+        }
+        if( abs( level[i] ) > ( 3 << ( i_suffix_length - 1 ) ) && i_suffix_length < 6 )
+        {
+            i_suffix_length++;
+        }
+    }
+    /* total zeros, not written when every coefficient is non-zero */
+    if( i_total < i_count )
+    {
+        if( i_idx == BLOCK_INDEX_CHROMA_DC )
+        {
+            bs_write_vlc( s, x264_total_zeros_dc[i_total-1][i_total_zero] );
+        }
+        else
+        {
+            bs_write_vlc( s, x264_total_zeros[i_total-1][i_total_zero] );
+        }
+    }
+    /* zero run below each coefficient but the first one, until no zero is left */
+    for( i = 0, i_zero_left = i_total_zero; i < i_total - 1; i++ )
+    {
+        int i_zl;
+
+        if( i_zero_left <= 0 )
+        {
+            break;
+        }
+
+        i_zl = X264_MIN( i_zero_left - 1, 6 );
+
+        bs_write_vlc( s, x264_run_before[i_zl][run[i]] );
+
+        i_zero_left -= run[i];
+    }
+}
+
+/*****************************************************************************
+ * x264_macroblock_write_cavlc:
+ *****************************************************************************/
+void x264_macroblock_write_cavlc( x264_t *h, bs_t *s )
+{
+    const int i_mb_type = h->mb.i_type;
+    int i_mb_i_offset;
+    int i;
+
+    switch( h->sh.i_type )
+    {
+        case SLICE_TYPE_I:
+            i_mb_i_offset = 0;
+            break;
+        case SLICE_TYPE_P:
+            i_mb_i_offset = 5;
+            break;
+        case SLICE_TYPE_B:
+            i_mb_i_offset = 23;
+            break;
+        default:
+            fprintf( stderr, "internal error or slice unsupported\n" );
+            return;
+    }
+
+    /* Write:
+      - type
+      - prediction
+      - mv */
+    if( i_mb_type == I_PCM )
+    {
+        /* Untested */
+        bs_write_ue( s, i_mb_i_offset + 25 );
+
+        bs_align_0( s );
+        /* Luma */
+        for( i = 0; i < 16*16; i++ )
+        {
+            const int x = 16 * h->mb.i_mb_x + (i % 16);
+            const int y = 16 * h->mb.i_mb_y + (i / 16);
+            bs_write( s, 8, h->fenc->plane[0][y*h->mb.pic.i_stride[0]+x] );
+        }
+        /* Cb */
+        for( i = 0; i < 8*8; i++ )
+        {
+            const int x = 8 * h->mb.i_mb_x + (i % 8);
+            const int y = 8 * h->mb.i_mb_y + (i / 8);
+            bs_write( s, 8, h->fenc->plane[1][y*h->mb.pic.i_stride[1]+x] );
+        }
+        /* Cr */
+        for( i = 0; i < 8*8; i++ )
+        {
+            const int x = 8 * h->mb.i_mb_x + (i % 8);
+            const int y = 8 * h->mb.i_mb_y + (i / 8);
+            bs_write( s, 8, h->fenc->plane[2][y*h->mb.pic.i_stride[2]+x] );
+        }
+        return;
+    }
+    else if( i_mb_type == I_4x4 )
+    {
+        bs_write_ue( s, i_mb_i_offset + 0 );
+
+        /* Prediction: Luma */
+        for( i = 0; i < 16; i++ )
+        {
+            int i_pred = x264_mb_predict_intra4x4_mode( h, i );
+            int i_mode = h->mb.cache.intra4x4_pred_mode[x264_scan8[i]];
+
+            if( i_pred == i_mode)
+            {
+                bs_write1( s, 1 );  /* b_prev_intra4x4_pred_mode */
+            }
+            else
+            {
+                bs_write1( s, 0 );  /* b_prev_intra4x4_pred_mode */
+                if( i_mode < i_pred )
+                {
+                    bs_write( s, 3, i_mode );
+                }
+                else
+                {
+                    bs_write( s, 3, i_mode - 1 );
+                }
+            }
+        }
+        bs_write_ue( s, h->mb.i_chroma_pred_mode );
+    }
+    else if( i_mb_type == I_16x16 )
+    {
+        bs_write_ue( s, i_mb_i_offset + 1 + h->mb.i_intra16x16_pred_mode +
+                        h->mb.i_cbp_chroma * 4 + ( h->mb.i_cbp_luma == 0 ? 0 : 12 ) );
+        bs_write_ue( s, h->mb.i_chroma_pred_mode );
+    }
+    else if( i_mb_type == P_L0 )
+    {
+        int mvp[2];
+
+        if( h->mb.i_partition == D_16x16 )
+        {
+            bs_write_ue( s, 0 );
+
+            if( h->sh.i_num_ref_idx_l0_active > 1 )
+            {
+                bs_write_te( s, h->sh.i_num_ref_idx_l0_active - 1, h->mb.cache.ref[0][x264_scan8[0]] );
+            }
+            x264_mb_predict_mv( h, 0, 0, 4, mvp );
+            bs_write_se( s, h->mb.cache.mv[0][x264_scan8[0]][0] - mvp[0] );
+            bs_write_se( s, h->mb.cache.mv[0][x264_scan8[0]][1] - mvp[1] );
+        }
+        else if( h->mb.i_partition == D_16x8 )
+        {
+            bs_write_ue( s, 1 );
+            if( h->sh.i_num_ref_idx_l0_active > 1 )
+            {
+                bs_write_te( s, h->sh.i_num_ref_idx_l0_active - 1, h->mb.cache.ref[0][x264_scan8[0]] );
+                bs_write_te( s, h->sh.i_num_ref_idx_l0_active - 1, h->mb.cache.ref[0][x264_scan8[8]] );
+            }
+
+            x264_mb_predict_mv( h, 0, 0, 4, mvp );
+            bs_write_se( s, h->mb.cache.mv[0][x264_scan8[0]][0] - mvp[0] );
+            bs_write_se( s, h->mb.cache.mv[0][x264_scan8[0]][1] - mvp[1] );
+
+            x264_mb_predict_mv( h, 0, 8, 4, mvp );
+            bs_write_se( s, h->mb.cache.mv[0][x264_scan8[8]][0] - mvp[0] );
+            bs_write_se( s, h->mb.cache.mv[0][x264_scan8[8]][1] - mvp[1] );
+        }
+        else if( h->mb.i_partition == D_8x16 )
+        {
+            bs_write_ue( s, 2 );
+            if( h->sh.i_num_ref_idx_l0_active > 1 )
+            {
+                bs_write_te( s, h->sh.i_num_ref_idx_l0_active - 1, h->mb.cache.ref[0][x264_scan8[0]] );
+                bs_write_te( s, h->sh.i_num_ref_idx_l0_active - 1, h->mb.cache.ref[0][x264_scan8[4]] );
+            }
+
+            x264_mb_predict_mv( h, 0, 0, 2, mvp );
+            bs_write_se( s, h->mb.cache.mv[0][x264_scan8[0]][0] - mvp[0] );
+            bs_write_se( s, h->mb.cache.mv[0][x264_scan8[0]][1] - mvp[1] );
+
+            x264_mb_predict_mv( h, 0, 4, 2, mvp );
+            bs_write_se( s, h->mb.cache.mv[0][x264_scan8[4]][0] - mvp[0] );
+            bs_write_se( s, h->mb.cache.mv[0][x264_scan8[4]][1] - mvp[1] );
+        }
+    }
+    else if( i_mb_type == P_8x8 )
+    {
+        int b_sub_ref0;
+
+        if( h->mb.cache.ref[0][x264_scan8[0]] == 0 && h->mb.cache.ref[0][x264_scan8[4]] == 0 &&
+            h->mb.cache.ref[0][x264_scan8[8]] == 0 && h->mb.cache.ref[0][x264_scan8[12]] == 0 )
+        {
+            bs_write_ue( s, 4 );
+            b_sub_ref0 = 0;
+        }
+        else
+        {
+            bs_write_ue( s, 3 );
+            b_sub_ref0 = 1;
+        }
+        /* sub mb type */
+        for( i = 0; i < 4; i++ )
+        {
+            switch( h->mb.i_sub_partition[i] )
+            {
+                case D_L0_8x8:
+                    bs_write_ue( s, 0 );
+                    break;
+                case D_L0_8x4:
+                    bs_write_ue( s, 1 );
+                    break;
+                case D_L0_4x8:
+                    bs_write_ue( s, 2 );
+                    break;
+                case D_L0_4x4:
+                    bs_write_ue( s, 3 );
+                    break;
+            }
+        }
+        /* ref0 */
+        if( h->sh.i_num_ref_idx_l0_active > 1 && b_sub_ref0 )
+        {
+            bs_write_te( s, h->sh.i_num_ref_idx_l0_active - 1, h->mb.cache.ref[0][x264_scan8[0]] );
+            bs_write_te( s, h->sh.i_num_ref_idx_l0_active - 1, h->mb.cache.ref[0][x264_scan8[4]] );
+            bs_write_te( s, h->sh.i_num_ref_idx_l0_active - 1, h->mb.cache.ref[0][x264_scan8[8]] );
+            bs_write_te( s, h->sh.i_num_ref_idx_l0_active - 1, h->mb.cache.ref[0][x264_scan8[12]] );
+        }
+        for( i = 0; i < 4; i++ )
+        {
+            int mvp[2];
+
+            switch( h->mb.i_sub_partition[i] )
+            {
+                case D_L0_8x8:
+                    x264_mb_predict_mv( h, 0, 4*i, 2, mvp );
+                    bs_write_se( s, h->mb.cache.mv[0][x264_scan8[4*i]][0] - mvp[0] );
+                    bs_write_se( s, h->mb.cache.mv[0][x264_scan8[4*i]][1] - mvp[1] );
+                    break;
+                case D_L0_8x4:
+                    x264_mb_predict_mv( h, 0, 4*i+0, 2, mvp );
+                    bs_write_se( s, h->mb.cache.mv[0][x264_scan8[4*i]][0] - mvp[0] );
+                    bs_write_se( s, h->mb.cache.mv[0][x264_scan8[4*i]][1] - mvp[1] );
+
+                    x264_mb_predict_mv( h, 0, 4*i+2, 2, mvp );
+                    bs_write_se( s, h->mb.cache.mv[0][x264_scan8[4*i+2]][0] - mvp[0] );
+                    bs_write_se( s, h->mb.cache.mv[0][x264_scan8[4*i+2]][1] - mvp[1] );
+                    break;
+                case D_L0_4x8:
+                    x264_mb_predict_mv( h, 0, 4*i+0, 1, mvp );
+                    bs_write_se( s, h->mb.cache.mv[0][x264_scan8[4*i]][0] - mvp[0] );
+                    bs_write_se( s, h->mb.cache.mv[0][x264_scan8[4*i]][1] - mvp[1] );
+
+                    x264_mb_predict_mv( h, 0, 4*i+1, 1, mvp );
+                    bs_write_se( s, h->mb.cache.mv[0][x264_scan8[4*i+1]][0] - mvp[0] );
+                    bs_write_se( s, h->mb.cache.mv[0][x264_scan8[4*i+1]][1] - mvp[1] );
+                    break;
+                case D_L0_4x4:
+                    x264_mb_predict_mv( h, 0, 4*i+0, 1, mvp );
+                    bs_write_se( s, h->mb.cache.mv[0][x264_scan8[4*i]][0] - mvp[0] );
+                    bs_write_se( s, h->mb.cache.mv[0][x264_scan8[4*i]][1] - mvp[1] );
+
+                    x264_mb_predict_mv( h, 0, 4*i+1, 1, mvp );
+                    bs_write_se( s, h->mb.cache.mv[0][x264_scan8[4*i+1]][0] - mvp[0] );
+                    bs_write_se( s, h->mb.cache.mv[0][x264_scan8[4*i+1]][1] - mvp[1] );
+
+                    x264_mb_predict_mv( h, 0, 4*i+2, 1, mvp );
+                    bs_write_se( s, h->mb.cache.mv[0][x264_scan8[4*i+2]][0] - mvp[0] );
+                    bs_write_se( s, h->mb.cache.mv[0][x264_scan8[4*i+2]][1] - mvp[1] );
+
+                    x264_mb_predict_mv( h, 0, 4*i+3, 1, mvp );
+                    bs_write_se( s, h->mb.cache.mv[0][x264_scan8[4*i+3]][0] - mvp[0] );
+                    bs_write_se( s, h->mb.cache.mv[0][x264_scan8[4*i+3]][1] - mvp[1] );
+                    break;
+            }
+        }
+    }
+    else if( i_mb_type == B_8x8 )
+    {
+        fprintf( stderr, "invalid/unhandled mb_type (B_8x8)\n" );
+        return;
+    }
+    else if( i_mb_type != B_DIRECT )
+    {
+        /* All B mode */
+        /* Motion Vector */
+        int i_list;
+        int mvp[2];
+
+        int b_list[2][2];
+
+        /* init ref list utilisations */
+        for( i = 0; i < 2; i++ )
+        {
+            b_list[0][i] = x264_mb_type_list0_table[i_mb_type][i];
+            b_list[1][i] = x264_mb_type_list1_table[i_mb_type][i];
+        }
+
+
+        if( h->mb.i_partition == D_16x16 )
+        {
+            if( b_list[0][0] && b_list[1][0] )
+            {
+                bs_write_ue( s, 3 );
+            }
+            else if( b_list[1][0] )
+            {
+                bs_write_ue( s, 2 );
+            }
+            else
+            {
+                bs_write_ue( s, 1 );
+            }
+        }
+        else
+        {
+            if( i_mb_type == B_BI_BI )
+            {
+                bs_write_ue( s, 20 + (h->mb.i_partition == D_16x8 ? 0 : 1 ) );
+            }
+            else if( b_list[0][0] && b_list[1][0] )
+            {
+                /* B_BI_LX* */
+                bs_write_ue( s, 16 + (b_list[0][1]?0:2) + (h->mb.i_partition == D_16x8?0:1) );
+            }
+            else if( b_list[0][1] && b_list[1][1] )
+            {
+                /* B_LX_BI */
+                bs_write_ue( s, 12 + (b_list[0][1]?0:2) + (h->mb.i_partition == D_16x8?0:1) );
+            }
+            else if( b_list[1][1] )
+            {
+                /* B_LX_L1 */
+                bs_write_ue( s, 6 + (b_list[0][0]?2:0) + (h->mb.i_partition == D_16x8?0:1) );
+            }
+            else if( b_list[0][1] )
+            {
+                /* B_LX_L0 */
+                bs_write_ue( s, 4 + (b_list[0][0]?0:6) + (h->mb.i_partition == D_16x8?0:1) );
+            }
+        }
+
+        for( i_list = 0; i_list < 2; i_list++ )
+        {
+            const int i_ref_max = i_list == 0 ? h->sh.i_num_ref_idx_l0_active : h->sh.i_num_ref_idx_l1_active;
+
+            if( i_ref_max > 1 )
+            {
+                switch( h->mb.i_partition )
+                {
+                    case D_16x16:
+                        if( b_list[i_list][0] ) bs_write_te( s, i_ref_max - 1, h->mb.cache.ref[i_list][x264_scan8[0]] );
+                        break;
+                    case D_16x8:
+                        if( b_list[i_list][0] ) bs_write_te( s, i_ref_max - 1, h->mb.cache.ref[i_list][x264_scan8[0]] );
+                        if( b_list[i_list][1] ) bs_write_te( s, i_ref_max - 1, h->mb.cache.ref[i_list][x264_scan8[8]] );
+                        break;
+                    case D_8x16:
+                        if( b_list[i_list][0] ) bs_write_te( s, i_ref_max - 1, h->mb.cache.ref[i_list][x264_scan8[0]] );
+                        if( b_list[i_list][1] ) bs_write_te( s, i_ref_max - 1, h->mb.cache.ref[i_list][x264_scan8[4]] );
+                        break;
+                }
+            }
+        }
+        for( i_list = 0; i_list < 2; i_list++ )
+        {
+            switch( h->mb.i_partition )
+            {
+                case D_16x16:
+                    if( b_list[i_list][0] )
+                    {
+                        x264_mb_predict_mv( h, i_list, 0, 4, mvp );
+                        bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[0]][0] - mvp[0] );
+                        bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[0]][1] - mvp[1] );
+                    }
+                    break;
+                case D_16x8:
+                    if( b_list[i_list][0] )
+                    {
+                        x264_mb_predict_mv( h, i_list, 0, 4, mvp );
+                        bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[0]][0] - mvp[0] );
+                        bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[0]][1] - mvp[1] );
+                    }
+                    if( b_list[i_list][1] )
+                    {
+                        x264_mb_predict_mv( h, i_list, 8, 4, mvp );
+                        bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[8]][0] - mvp[0] );
+                        bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[8]][1] - mvp[1] );
+                    }
+                    break;
+                case D_8x16:
+                    if( b_list[i_list][0] )
+                    {
+                        x264_mb_predict_mv( h, i_list, 0, 2, mvp );
+                        bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[0]][0] - mvp[0] );
+                        bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[0]][1] - mvp[1] );
+                    }
+                    if( b_list[i_list][1] )
+                    {
+                        x264_mb_predict_mv( h, i_list, 4, 2, mvp );
+                        bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[4]][0] - mvp[0] );
+                        bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[4]][1] - mvp[1] );
+                    }
+                    break;
+            }
+        }
+    }
+    else if( i_mb_type == B_DIRECT )
+    {
+        bs_write_ue( s, 0 );
+    }
+    else
+    {
+        fprintf( stderr, "invalid/unhandled mb_type\n" );
+        return;
+    }
+
+    /* Coded block pattern */
+    if( i_mb_type == I_4x4 )
+    {
+        bs_write_ue( s, intra4x4_cbp_to_golomb[( h->mb.i_cbp_chroma << 4 )|h->mb.i_cbp_luma] );
+    }
+    else if( i_mb_type != I_16x16 )
+    {
+        bs_write_ue( s, inter_cbp_to_golomb[( h->mb.i_cbp_chroma << 4 )|h->mb.i_cbp_luma] );
+    }
+
+    /* write residual */
+    if( i_mb_type == I_16x16 )
+    {
+        bs_write_se( s, h->mb.qp[h->mb.i_mb_xy] - h->mb.i_last_qp );
+
+        /* DC Luma */
+        block_residual_write_cavlc( h, s, BLOCK_INDEX_LUMA_DC , h->dct.luma16x16_dc, 16 );
+
+        if( h->mb.i_cbp_luma != 0 )
+        {
+            /* AC Luma */
+            for( i = 0; i < 16; i++ )
+            {
+                block_residual_write_cavlc( h, s, i, h->dct.block[i].residual_ac, 15 );
+            }
+        }
+    }
+    else if( h->mb.i_cbp_luma != 0 || h->mb.i_cbp_chroma != 0 )
+    {
+        bs_write_se( s, h->mb.qp[h->mb.i_mb_xy] - h->mb.i_last_qp );
+
+        for( i = 0; i < 16; i++ )
+        {
+            if( h->mb.i_cbp_luma & ( 1 << ( i / 4 ) ) )
+            {
+                block_residual_write_cavlc( h, s, i, h->dct.block[i].luma4x4, 16 );
+            }
+        }
+    }
+    if( h->mb.i_cbp_chroma != 0 )
+    {
+        /* Chroma DC residual present */
+        block_residual_write_cavlc( h, s, BLOCK_INDEX_CHROMA_DC, h->dct.chroma_dc[0], 4 );
+        block_residual_write_cavlc( h, s, BLOCK_INDEX_CHROMA_DC, h->dct.chroma_dc[1], 4 );
+        if( h->mb.i_cbp_chroma&0x02 ) /* Chroma AC residual present */
+        {
+            for( i = 0; i < 8; i++ )
+            {
+                block_residual_write_cavlc( h, s, 16 + i, h->dct.block[16+i].residual_ac, 15 );
+            }
+        }
+    }
+}
diff --git a/encoder/encoder.c b/encoder/encoder.c
new file mode 100644
index 00000000..73ca380b
--- /dev/null
+++ b/encoder/encoder.c
@@ -0,0 +1,1235 @@
+/*****************************************************************************
+ * x264: h264 encoder
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: encoder.c,v 1.1 2004/06/03 19:27:08 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdint.h>
+
+#include <math.h>
+
+#include "../core/common.h"
+#include "../core/cpu.h"
+
+#include "set.h"
+#include "analyse.h"
+#include "ratecontrol.h"
+#include "macroblock.h"
+
+//#define DEBUG_MB_TYPE
+//#define DEBUG_DUMP_FRAME 1
+/* Timing accumulators for TIMER_START/TIMER_STOP below (each pair adds an x264_mdate() difference to its argument). */
+static int64_t i_mtime_encode_frame = 0;
+
+static int64_t i_mtime_analyse = 0;
+static int64_t i_mtime_encode = 0;
+static int64_t i_mtime_write = 0;
+static int64_t i_mtime_filter = 0;
+#define TIMER_START( d ) \
+    { \
+        int64_t d##start = x264_mdate();
+/* TIMER_STOP closes the brace that TIMER_START opens, so the two must be used as a pair within one scope. */
+#define TIMER_STOP( d ) \
+        d += x264_mdate() - d##start;\
+    }
+
+
+/****************************************************************************
+ *
+ ******************************* x264 libs **********************************
+ *
+ ****************************************************************************/
+
+/* PSNR in dB between two planes of 8-bit samples (65025 = 255*255); returns -1.0 when the planes do not differ. */
+static float x264_psnr( uint8_t *pix1, int i_pix_stride, uint8_t *pix2, int i_pix2_stride, int i_width, int i_height )
+{
+    int64_t i_sum = 0; /* sum of squared sample differences */
+    int i_row, i_col;
+
+    for( i_row = 0; i_row < i_height; i_row++ )
+    {
+        /* first sample of the current row in each plane */
+        const uint8_t *p_a = &pix1[i_row*i_pix_stride];
+        const uint8_t *p_b = &pix2[i_row*i_pix2_stride];
+
+        for( i_col = 0; i_col < i_width; i_col++ )
+        {
+            const int i_diff = p_a[i_col] - p_b[i_col];
+
+            i_sum += i_diff * i_diff;
+        }
+    }
+
+    /* a zero error would divide by zero in the ratio below, so it is reported with the -1.0 sentinel */
+    if( i_sum == 0 ) return -1.0;
+
+    return (float)(10.0 * log( (double)65025.0 * (double)i_height * (double)i_width / (double)i_sum ) / log( 10.0 ));
+}
+/* Debug helper: append each plane of fr to the file `name`, row by row (planes > 0 use half width and height). */
+#if DEBUG_DUMP_FRAME
+static void x264_frame_dump( x264_t *h, x264_frame_t *fr, char *name )
+{
+    FILE * f = fopen( name, "ab" ); /* binary append: a text-mode stream may translate bytes of the raw samples */
+    int i, y;
+    if( f == NULL ) return; /* best-effort debug dump: skip it when the file cannot be opened */
+    fseek( f, 0, SEEK_END );
+
+    for( i = 0; i < fr->i_plane; i++ )
+    {
+        for( y = 0; y < h->param.i_height / ( i == 0 ? 1 : 2 ); y++ )
+        {
+            fwrite( &fr->plane[i][y*fr->i_stride[i]], 1, h->param.i_width / ( i == 0 ? 1 : 2 ), f );
+        }
+    }
+    fclose( f );
+}
+#endif
+
+
+/* Fill *sh with default values.
+ * The caller (x264_slice_init) afterwards overrides the reference counts, POC lsb, QP delta and cabac idc as needed.
+ * i_idr_pic_id is stored as given; x264_slice_header_write only emits it when it is >= 0. */
+static void x264_slice_header_init( x264_slice_header_t *sh, x264_param_t *param,
+                                    x264_sps_t *sps, x264_pps_t *pps,
+                                    int i_type, int i_idr_pic_id, int i_frame )
+{
+    /* parameter sets referenced by this slice */
+    sh->sps      = sps;
+    sh->pps      = pps;
+    sh->i_pps_id = pps->i_id;
+
+    /* slice identity */
+    sh->i_first_mb   = 0;
+    sh->i_type       = i_type;
+    sh->i_frame_num  = i_frame;
+    sh->i_idr_pic_id = i_idr_pic_id;
+
+    /* Not field support for now */
+    sh->b_field_pic    = 0;
+    sh->b_bottom_field = 1; /* not yet used */
+
+    /* poc stuff, fixed later */
+    sh->i_poc_lsb          = 0;
+    sh->i_delta_poc_bottom = 0;
+    sh->i_delta_poc[1]     = 0;
+    sh->i_delta_poc[0]     = 0;
+
+    sh->i_redundant_pic_cnt = 0;
+
+    /* prediction defaults: one active reference per list, no override signalled */
+    sh->b_direct_spatial_mv_pred = 1;
+    sh->i_num_ref_idx_l0_active  = 1;
+    sh->i_num_ref_idx_l1_active  = 1;
+    sh->b_num_ref_idx_override   = 0;
+
+    /* cabac_init_idc is taken from the encoder parameters */
+    sh->i_cabac_init_idc = param->i_cabac_init_idc;
+
+    /* quantiser deltas default to zero */
+    sh->i_qp_delta      = 0;
+    sh->i_qs_delta      = 0;
+    sh->b_sp_for_swidth = 0;
+
+    /* idc is 0 when param->b_deblocking_filter is set, 1 otherwise */
+    sh->i_disable_deblocking_filter_idc = param->b_deblocking_filter ? 0 : 1;
+
+    /* offsets are stored doubled; x264_slice_header_write shifts them back by one bit */
+    sh->i_alpha_c0_offset = param->i_deblocking_filter_alphac0 << 1;
+    sh->i_beta_offset     = param->i_deblocking_filter_beta << 1;
+}
+/* Write the fields of *sh to bitstream s in a fixed order; i_nal_ref_idc != 0 additionally emits the reference marking flags. */
+static void x264_slice_header_write( bs_t *s, x264_slice_header_t *sh, int i_nal_ref_idc )
+{
+    bs_write_ue( s, sh->i_first_mb );
+    bs_write_ue( s, sh->i_type + 5 );   /* same type things (presumably the "all slices share this type" code range - confirm against the H.264 spec) */
+    bs_write_ue( s, sh->i_pps_id );
+    bs_write( s, sh->sps->i_log2_max_frame_num, sh->i_frame_num );
+    /* idr_pic_id is only present when non-negative; x264_slice_init passes -1 for non-IDR slices */
+    if( sh->i_idr_pic_id >= 0 ) /* NAL IDR */
+    {
+        bs_write_ue( s, sh->i_idr_pic_id );
+    }
+    /* picture order count fields, depending on the SPS poc type (nothing is written for other types) */
+    if( sh->sps->i_poc_type == 0 )
+    {
+        bs_write( s, sh->sps->i_log2_max_poc_lsb, sh->i_poc_lsb );
+        if( sh->pps->b_pic_order && !sh->b_field_pic )
+        {
+            bs_write_se( s, sh->i_delta_poc_bottom );
+        }
+    }
+    else if( sh->sps->i_poc_type == 1 && !sh->sps->b_delta_pic_order_always_zero )
+    {
+        bs_write_se( s, sh->i_delta_poc[0] );
+        if( sh->pps->b_pic_order && !sh->b_field_pic )
+        {
+            bs_write_se( s, sh->i_delta_poc[1] );
+        }
+    }
+
+    if( sh->pps->b_redundant_pic_cnt )
+    {
+        bs_write_ue( s, sh->i_redundant_pic_cnt );
+    }
+
+    if( sh->i_type == SLICE_TYPE_B )
+    {
+        bs_write1( s, sh->b_direct_spatial_mv_pred );
+    }
+    if( sh->i_type == SLICE_TYPE_P || sh->i_type == SLICE_TYPE_SP || sh->i_type == SLICE_TYPE_B )
+    {
+        bs_write1( s, sh->b_num_ref_idx_override );
+        if( sh->b_num_ref_idx_override )
+        {
+            bs_write_ue( s, sh->i_num_ref_idx_l0_active - 1 );
+            if( sh->i_type == SLICE_TYPE_B )
+            {
+                bs_write_ue( s, sh->i_num_ref_idx_l1_active - 1 );
+            }
+        }
+    }
+
+    /* ref pic list reordering: both flags are hard-coded to 0, so only the flag bit itself is written */
+    if( sh->i_type != SLICE_TYPE_I )
+    {
+        int b_ref_pic_list_reordering_l0 = 0;
+        bs_write1( s, b_ref_pic_list_reordering_l0 );
+        if( b_ref_pic_list_reordering_l0 )
+        {
+            /* FIXME: no reordering commands are emitted */
+        }
+    }
+    if( sh->i_type == SLICE_TYPE_B )
+    {
+        int b_ref_pic_list_reordering_l1 = 0;
+        bs_write1( s, b_ref_pic_list_reordering_l1 );
+        if( b_ref_pic_list_reordering_l1 )
+        {
+            /* FIXME: no reordering commands are emitted */
+        }
+    }
+
+    if( ( sh->pps->b_weighted_pred && ( sh->i_type == SLICE_TYPE_P || sh->i_type == SLICE_TYPE_SP ) ) ||
+        ( sh->pps->b_weighted_bipred == 1 && sh->i_type == SLICE_TYPE_B ) )
+    {
+        /* FIXME: nothing is written for weighted prediction */
+    }
+    /* reference marking flags, only for NAL units with a non-zero ref_idc */
+    if( i_nal_ref_idc != 0 )
+    {
+        if( sh->i_idr_pic_id >= 0 )
+        {
+            bs_write1( s, 0 );  /* no output of prior pics flag */
+            bs_write1( s, 0 );  /* long term reference flag */
+        }
+        else
+        {
+            bs_write1( s, 0 );  /* adaptive_ref_pic_marking_mode_flag */
+            /* FIXME */
+        }
+    }
+
+    if( sh->pps->b_cabac && sh->i_type != SLICE_TYPE_I )
+    {
+        bs_write_ue( s, sh->i_cabac_init_idc );
+    }
+    bs_write_se( s, sh->i_qp_delta );      /* slice qp delta */
+#if 0
+    if( sh->i_type == SLICE_TYPE_SP || sh->i_type == SLICE_TYPE_SI )
+    {
+        if( sh->i_type == SLICE_TYPE_SP )
+        {
+            bs_write1( s, sh->b_sp_for_swidth );
+        }
+        bs_write_se( s, sh->i_qs_delta );
+    }
+#endif
+    /* the offsets are stored doubled by x264_slice_header_init, hence the >> 1 when writing them */
+    if( sh->pps->b_deblocking_filter_control )
+    {
+        bs_write_ue( s, sh->i_disable_deblocking_filter_idc );
+        if( sh->i_disable_deblocking_filter_idc != 1 )
+        {
+            bs_write_se( s, sh->i_alpha_c0_offset >> 1 );
+            bs_write_se( s, sh->i_beta_offset >> 1 );
+        }
+    }
+}
+
+/****************************************************************************
+ *
+ ****************************************************************************
+ ****************************** External API*********************************
+ ****************************************************************************
+ *
+ ****************************************************************************/
+
+/****************************************************************************
+ * x264_encoder_open: check and clamp *param, then build a new encoder handle (NULL on invalid parameters or failed handle allocation)
+ ****************************************************************************/
+x264_t *x264_encoder_open   ( x264_param_t *param )
+{
+    x264_t *h;
+    int i;
+
+    /* Check parameters validity */
+    if( param->i_width <= 0  || param->i_height <= 0 )
+    {
+        fprintf( stderr, "invalid width x height (%dx%d)\n",
+                 param->i_width, param->i_height );
+        return NULL;
+    }
+
+    if( param->i_width % 16 != 0 || param->i_height % 16 != 0 )
+    {
+        fprintf( stderr, "width %% 16 != 0 or height %% 16 != 0 (%dx%d)\n",
+                 param->i_width, param->i_height );
+        return NULL;
+    }
+    if( param->i_csp != X264_CSP_I420 )
+    {
+        fprintf( stderr, "invalid CSP (only I420 supported)\n" );
+        return NULL;
+    }
+
+    /* Allocate only once the parameters are accepted, so the rejections above have nothing to release */
+    h = x264_malloc( sizeof( x264_t ) );
+    if( h == NULL ) return NULL;
+    /* Fix parameters values */
+    memcpy( &h->param, param, sizeof( x264_param_t ) );
+    h->param.i_frame_reference = x264_clip3( h->param.i_frame_reference, 1, 15 );
+    if( h->param.i_idrframe <= 0 )
+    {
+        h->param.i_idrframe = 1;
+    }
+    if( h->param.i_iframe <= 0 )
+    {
+        h->param.i_iframe = 1;
+    }
+    h->param.i_bframe  = x264_clip3( h->param.i_bframe , 0, X264_BFRAME_MAX );
+#if 0
+    if( h->param.i_bframe > 0 && h->param.b_cabac )
+    {
+        fprintf( stderr, "cabac not supported with B frame (cabac disabled)\n" );
+        h->param.b_cabac = 0;
+    }
+#endif
+
+    h->param.i_deblocking_filter_alphac0 = x264_clip3( h->param.i_deblocking_filter_alphac0, -6, 6 );
+    h->param.i_deblocking_filter_beta    = x264_clip3( h->param.i_deblocking_filter_beta, -6, 6 );
+
+    h->param.i_cabac_init_idc = x264_clip3( h->param.i_cabac_init_idc, -1, 2 );
+
+    /* Init x264_t */
+    h->out.i_nal = 0;
+    h->out.i_bitstream = 1000000; /* FIXME estimate max size (width/height) */
+    h->out.p_bitstream = x264_malloc( h->out.i_bitstream ); /* NOTE(review): allocation result is not checked */
+
+    h->i_frame = 0;
+    h->i_frame_num = 0;
+    h->i_poc   = 0;
+    h->i_idr_pic_id = 0;
+
+    h->sps = &h->sps_array[0];
+    x264_sps_init( h->sps, 0, &h->param );
+
+    h->pps = &h->pps_array[0];
+    x264_pps_init( h->pps, 0, &h->param, h->sps);
+
+    /* Init frames: every list slot starts empty, then 1 + i_bframe frames are parked in the unused list */
+    for( i = 0; i < X264_BFRAME_MAX + 1; i++ )
+    {
+        h->frames.current[i] = NULL;
+        h->frames.next[i]    = NULL;
+        h->frames.unused[i]  = NULL;
+    }
+    for( i = 0; i < 1 + h->param.i_bframe; i++ )
+    {
+        h->frames.unused[i] =  x264_frame_new( h );
+    }
+    for( i = 0; i < 2 + h->param.i_frame_reference; i++ )
+    {
+        /* 2 = 1 backward ref  + 1 fdec */
+        h->frames.reference[i] = x264_frame_new( h );
+    }
+    h->frames.i_last_idr = h->param.i_idrframe;
+    h->frames.i_last_i   = h->param.i_iframe;
+
+    h->i_ref0 = 0;
+    h->i_ref1 = 0;
+
+    h->fdec = h->frames.reference[0];
+
+    /* init mb cache */
+    x264_macroblock_cache_init( h );
+
+    /* init cabac adaptive model */
+    x264_cabac_model_init( &h->cabac );
+
+    /* init CPU functions */
+    x264_predict_16x16_init( h->param.cpu, h->predict_16x16 );
+    x264_predict_8x8_init( h->param.cpu, h->predict_8x8 );
+    x264_predict_4x4_init( h->param.cpu, h->predict_4x4 );
+
+    x264_pixel_init( h->param.cpu, &h->pixf );
+    x264_dct_init( h->param.cpu, &h->dctf );
+    x264_mc_init( h->param.cpu, h->mc );
+    x264_csp_init( h->param.cpu, h->param.i_csp, &h->csp );
+
+    /* rate control */
+    h->rc = x264_ratecontrol_new( &h->param );
+
+    /* stat */
+    h->stat.i_slice_count[SLICE_TYPE_I] = 0;
+    h->stat.i_slice_count[SLICE_TYPE_P] = 0;
+    h->stat.i_slice_count[SLICE_TYPE_B] = 0;
+    h->stat.i_slice_size[SLICE_TYPE_I] = 0;
+    h->stat.i_slice_size[SLICE_TYPE_P] = 0;
+    h->stat.i_slice_size[SLICE_TYPE_B] = 0;
+
+    h->stat.f_psnr_y[SLICE_TYPE_I] = 0.0; h->stat.f_psnr_u[SLICE_TYPE_I] = 0.0; h->stat.f_psnr_v[SLICE_TYPE_I] = 0.0;
+    h->stat.f_psnr_y[SLICE_TYPE_P] = 0.0; h->stat.f_psnr_u[SLICE_TYPE_P] = 0.0; h->stat.f_psnr_v[SLICE_TYPE_P] = 0.0;
+    h->stat.f_psnr_y[SLICE_TYPE_B] = 0.0; h->stat.f_psnr_u[SLICE_TYPE_B] = 0.0; h->stat.f_psnr_v[SLICE_TYPE_B] = 0.0;
+
+    for( i = 0; i < 17; i++ )
+    {
+        h->stat.i_mb_count[SLICE_TYPE_I][i] = 0;
+        h->stat.i_mb_count[SLICE_TYPE_P][i] = 0;
+        h->stat.i_mb_count[SLICE_TYPE_B][i] = 0;
+    }
+    return h;
+}
+/* internal usage */
+/* Begin NAL number h->out.i_nal: record its type and ref_idc and point its payload at the current byte of the output buffer. */
+static void x264_nal_start( x264_t *h, int i_type, int i_ref_idc )
+{
+    x264_nal_t *p_nal = h->out.nal + h->out.i_nal;
+
+    /* align before sampling bs_pos(): the payload pointer is a whole-byte address */
+    bs_align_0( &h->out.bs );   /* not needed */
+
+    p_nal->i_type    = i_type;
+    p_nal->i_ref_idc = i_ref_idc;
+    p_nal->i_payload = 0;
+    p_nal->p_payload = h->out.p_bitstream + bs_pos( &h->out.bs ) / 8;
+}
+/* Close the NAL begun by x264_nal_start: its size is the byte distance from p_payload to the writer, then i_nal advances. */
+static void x264_nal_end( x264_t *h )
+{
+    x264_nal_t *p_nal = h->out.nal + h->out.i_nal;
+
+    bs_align_0( &h->out.bs );   /* not needed */
+    p_nal->i_payload = ( h->out.p_bitstream + bs_pos( &h->out.bs ) / 8 ) - p_nal->p_payload;
+
+    h->out.i_nal += 1;
+}
+
+/****************************************************************************
+ * x264_encoder_headers: emit the SPS and PPS NAL units while h->i_frame is still 0; always returns 0
+ ****************************************************************************/
+int x264_encoder_headers( x264_t *h, x264_nal_t **pp_nal, int *pi_nal )
+{
+    /* restart the output: writer re-initialised on the bitstream buffer, no NAL recorded yet */
+    bs_init( &h->out.bs, h->out.p_bitstream, h->out.i_bitstream );
+    h->out.i_nal = 0;
+
+    /* Put SPS and PPS */
+    if( h->i_frame == 0 )
+    {
+        /* sequence parameter set */
+        x264_nal_start( h, NAL_SPS, NAL_PRIORITY_HIGHEST );
+        x264_sps_write( &h->out.bs, h->sps );
+        x264_nal_end( h );
+
+        /* picture parameter set */
+        x264_nal_start( h, NAL_PPS, NAL_PRIORITY_HIGHEST );
+        x264_pps_write( &h->out.bs, h->pps );
+        x264_nal_end( h );
+    }
+
+    /* hand the NAL array and the number of NALs written back to the caller */
+    *pp_nal = h->out.nal;
+    *pi_nal = h->out.i_nal;
+    return 0;
+}
+/* Store frame in the first empty (NULL) slot of list; lists hold X264_BFRAME_MAX+1 slots, as x264_frame_get assumes. */
+/* The scan is bounded so that a list without a free slot is reported instead of being written past its end. */
+static void x264_frame_put( x264_frame_t *list[X264_BFRAME_MAX+1], x264_frame_t *frame )
+{
+    int i = 0;
+
+    while( i <= X264_BFRAME_MAX && list[i] != NULL ) i++;
+    if( i > X264_BFRAME_MAX ) { fprintf( stderr, "x264_frame_put: list is full\n" ); return; }
+    list[i] = frame;
+}
+/* Pop the head of list: the remaining entries move down by one slot and the last slot becomes NULL. */
+static x264_frame_t *x264_frame_get( x264_frame_t *list[X264_BFRAME_MAX+1] )
+{
+    x264_frame_t *p_head = list[0];
+    int i_slot = 0;
+
+    while( i_slot < X264_BFRAME_MAX )
+    {
+        list[i_slot] = list[i_slot+1];
+        i_slot++;
+    }
+    list[X264_BFRAME_MAX] = NULL;
+    return p_head;
+}
+/* Rebuild h->fref0 / h->fref1 (and the counts h->i_ref0 / h->i_ref1) from h->frames.reference[1..] for a picture of POC i_poc. */
+static inline void x264_reference_build_list( x264_t *h, int i_poc )
+{
+    int i;
+    int b_ok;
+
+    /* build ref list 0/1: frames with 0 <= poc < i_poc go to fref0, poc > i_poc to fref1; negative or equal poc is skipped */
+    h->i_ref0 = 0;
+    h->i_ref1 = 0;
+    for( i = 1; i < h->param.i_frame_reference+2; i++ )
+    {
+        if( h->frames.reference[i]->i_poc >= 0 )
+        {
+            if( h->frames.reference[i]->i_poc < i_poc )
+            {
+                h->fref0[h->i_ref0++] = h->frames.reference[i];
+            }
+            else if( h->frames.reference[i]->i_poc > i_poc )
+            {
+                h->fref1[h->i_ref1++] = h->frames.reference[i];
+            }
+        }
+    }
+    /* Order ref0 from higher to lower poc (swap one adjacent pair, then restart the scan until no swap is needed) */
+    do
+    {
+        b_ok = 1;
+        for( i = 0; i < h->i_ref0 - 1; i++ )
+        {
+            if( h->fref0[i]->i_poc < h->fref0[i+1]->i_poc )
+            {
+                x264_frame_t *tmp = h->fref0[i+1];
+
+                h->fref0[i+1] = h->fref0[i];
+                h->fref0[i] = tmp;
+                b_ok = 0;
+                break;
+            }
+        }
+    } while( !b_ok );
+    /* Order ref1 from lower to higher poc (bubble sort) for B-frame */
+    do
+    {
+        b_ok = 1;
+        for( i = 0; i < h->i_ref1 - 1; i++ )
+        {
+            if( h->fref1[i]->i_poc > h->fref1[i+1]->i_poc )
+            {
+                x264_frame_t *tmp = h->fref1[i+1];
+
+                h->fref1[i+1] = h->fref1[i];
+                h->fref1[i] = tmp;
+                b_ok = 0;
+                break;
+            }
+        }
+    } while( !b_ok );
+    /* Clamp the counts: at most i_frame_reference entries in list 0 and a single entry in list 1 */
+    if( h->i_ref0 > h->param.i_frame_reference )
+    {
+        h->i_ref0 = h->param.i_frame_reference;
+    }
+    if( h->i_ref1 > 1 )
+    {
+        h->i_ref1 = 1;
+    }
+}
+/* Finish the picture in h->fdec (optional deblocking, border expansion), then rotate h->frames.reference to pick the next fdec. */
+static inline void x264_reference_update( x264_t *h )
+{
+    int i;
+
+    /* apply deblocking filter to the current decoded picture */
+    if( h->param.b_deblocking_filter )
+    {
+        TIMER_START( i_mtime_filter );
+        x264_frame_deblocking_filter( h, h->sh.i_type );
+        TIMER_STOP( i_mtime_filter );
+    }
+    /* expand border */
+    x264_frame_expand_border( h->fdec );
+
+    /* move frame in the buffer: the last slot becomes the new fdec, every other slot shifts up by one, and fdec is stored in slot 0 */
+    h->fdec = h->frames.reference[h->param.i_frame_reference+1];
+    for( i = h->param.i_frame_reference+1; i > 0; i-- )
+    {
+        h->frames.reference[i] = h->frames.reference[i-1];
+    }
+    h->frames.reference[0] = h->fdec;
+}
+/* Give reference[1..i_frame_reference+1] a poc of -1 (x264_reference_build_list skips negative pocs) and reference[0] a poc of 0. */
+static inline void x264_reference_reset( x264_t *h )
+{
+    const int i_last = h->param.i_frame_reference + 1;
+    int i_slot;
+
+    for( i_slot = 1; i_slot <= i_last; i_slot++ )
+    {
+        h->frames.reference[i_slot]->i_poc = -1;
+    }
+    h->frames.reference[0]->i_poc = 0;
+}
+/* Prepare h->sh for the next slice: default header, active reference counts (non-IDR only),
+ * POC lsb for poc type 0, QP delta relative to the PPS, and the cabac init idc when it is chosen adaptively. */
+static inline void x264_slice_init( x264_t *h, int i_nal_type, int i_slice_type, int i_global_qp )
+{
+    const int b_idr = ( i_nal_type == NAL_SLICE_IDR );
+
+    /* ------------------------ Create slice header  ----------------------- */
+    /* IDR slices carry the current idr_pic_id, every other slice gets -1 */
+    x264_slice_header_init( &h->sh, &h->param, h->sps, h->pps, i_slice_type,
+                            b_idr ? h->i_idr_pic_id : -1, h->i_frame_num - 1 );
+    if( b_idr )
+    {
+        /* increment id */
+        h->i_idr_pic_id = ( h->i_idr_pic_id + 1 ) % 65535;
+    }
+    else
+    {
+        /* always set the real higher num of ref frame used */
+        h->sh.b_num_ref_idx_override  = 1;
+        h->sh.i_num_ref_idx_l0_active = h->i_ref0 > 0 ? h->i_ref0 : 1;
+        h->sh.i_num_ref_idx_l1_active = h->i_ref1 > 0 ? h->i_ref1 : 1;
+    }
+
+    /* picture order count: only poc type 0 is filled in here */
+    switch( h->sps->i_poc_type )
+    {
+        case 0:
+            h->sh.i_poc_lsb = h->fdec->i_poc & ( (1 << h->sps->i_log2_max_poc_lsb) - 1 );
+            h->sh.i_delta_poc_bottom = 0;   /* XXX won't work for field */
+            break;
+        case 1:
+            /* FIXME TODO FIXME */
+            break;
+        default:
+            /* Nothing to do ? */
+            break;
+    }
+
+    /* global qp */
+    h->sh.i_qp_delta = i_global_qp - h->pps->i_pic_init_qp;
+
+    /* get adaptive cabac model if needed */
+    if( h->param.b_cabac && h->param.i_cabac_init_idc == -1 )
+    {
+        h->sh.i_cabac_init_idc = x264_cabac_model_get( &h->cabac, i_slice_type );
+    }
+}
+
+/* x264_slice_write: open a NAL unit, write the slice header, then analyse, encode and write every
+ * macroblock of the picture in raster order (CABAC or CAVLC depending on param.b_cabac) and close
+ * the NAL unit. i_mb_count[] is an output: all 18 entries are cleared, then i_mb_count[type] is
+ * incremented once per macroblock with the final type of that macroblock. */
+static inline void x264_slice_write( x264_t *h, int i_nal_type, int i_nal_ref_idc, int i_mb_count[18] )
+{
+    int i_skip;
+    int mb_xy;
+    int i;
+
+    /* Init stats: clear the whole array, the last entry included */
+    for( i = 0; i < 18; i++ ) i_mb_count[i] = 0;
+
+    /* Slice */
+    x264_nal_start( h, i_nal_type, i_nal_ref_idc );
+
+    /* Slice header */
+    x264_slice_header_write( &h->out.bs, &h->sh, i_nal_ref_idc );
+    if( h->param.b_cabac )
+    {
+        /* alignment needed */
+        bs_align_1( &h->out.bs );
+
+        /* init cabac */
+        x264_cabac_context_init( &h->cabac, h->sh.i_type, h->sh.pps->i_pic_init_qp + h->sh.i_qp_delta, h->sh.i_cabac_init_idc );
+        x264_cabac_encode_init ( &h->cabac, &h->out.bs );
+    }
+    h->mb.i_last_qp = h->pps->i_pic_init_qp + h->sh.i_qp_delta;
+    h->mb.i_last_dqp = 0;
+
+    for( mb_xy = 0, i_skip = 0; mb_xy < h->sps->i_mb_width * h->sps->i_mb_height; mb_xy++ )
+    {
+        const int i_mb_y = mb_xy / h->sps->i_mb_width;
+        const int i_mb_x = mb_xy % h->sps->i_mb_width;
+
+        /* load cache */
+        x264_macroblock_cache_load( h, i_mb_x, i_mb_y );
+
+        /* analyse parameters
+         * Slice I: choose I_4x4 or I_16x16 mode
+         * Slice P: choose between using P mode or intra (4x4 or 16x16)
+         * */
+        TIMER_START( i_mtime_analyse );
+        x264_macroblock_analyse( h );
+        TIMER_STOP( i_mtime_analyse );
+
+        /* encode this macroblock -> be careful, it can change the mb type to P_SKIP if needed */
+        TIMER_START( i_mtime_encode );
+        x264_macroblock_encode( h );
+        TIMER_STOP( i_mtime_encode );
+
+        TIMER_START( i_mtime_write );
+        if( IS_SKIP( h->mb.i_type ) )
+        {
+            if( h->param.b_cabac )
+            {
+                if( mb_xy > 0 )
+                {
+                    /* not end_of_slice_flag */
+                    x264_cabac_encode_terminal( &h->cabac, 0 );
+                }
+
+                x264_cabac_mb_skip( h, 1 );
+            }
+            else
+            {
+                i_skip++;
+            }
+        }
+        else
+        {
+            if( h->param.b_cabac )
+            {
+                if( mb_xy > 0 )
+                {
+                    /* not end_of_slice_flag */
+                    x264_cabac_encode_terminal( &h->cabac, 0 );
+                }
+                if( h->sh.i_type != SLICE_TYPE_I )
+                {
+                    x264_cabac_mb_skip( h, 0 );
+                }
+                x264_macroblock_write_cabac( h, &h->out.bs );
+            }
+            else
+            {
+                if( h->sh.i_type != SLICE_TYPE_I )
+                {
+                    bs_write_ue( &h->out.bs, i_skip );  /* skip run */
+                    i_skip = 0;
+                }
+                x264_macroblock_write_cavlc( h, &h->out.bs );
+            }
+        }
+        TIMER_STOP( i_mtime_write );
+
+        /* save cache */
+        x264_macroblock_cache_save( h );
+
+        i_mb_count[h->mb.i_type]++;
+    }
+
+    if( h->param.b_cabac )
+    {
+        int i_cabac_word;
+
+        /* end of slice */
+        x264_cabac_encode_terminal( &h->cabac, 1 );
+        x264_cabac_encode_flush( &h->cabac );
+        /* TODO cabac stuffing things (p209) */
+        i_cabac_word = (((3 * h->cabac.i_sym_cnt - 3 * 96 * h->sps->i_mb_width * h->sps->i_mb_height)/32) - bs_pos( &h->out.bs)/8)/3;
+        while( i_cabac_word > 0 )
+        {
+            bs_write( &h->out.bs, 16, 0x0000 );
+            i_cabac_word--;
+        }
+    }
+    else
+    {
+        if( i_skip > 0 )
+        {
+            bs_write_ue( &h->out.bs, i_skip );  /* last skip run */
+        }
+        /* rbsp_slice_trailing_bits */
+        bs_rbsp_trailing( &h->out.bs );
+    }
+
+    x264_nal_end( h );
+}
+
+/****************************************************************************
+ * x264_encoder_encode:
+ *  Queue pic (NULL = no more input, flush the frames still waiting), then
+ *  encode the next queued frame. *pp_nal / *pi_nal receive the NAL units
+ *  written (*pi_nal == 0 when nothing could be coded yet). Always returns 0.
+ *  XXX: i_poc   : is the poc of the current given picture
+ *       i_frame : is the number of the frame being coded
+ *  ex:  type frame poc
+ *       I      0   2*0
+ *       P      1   2*3
+ *       B      2   2*1
+ *       B      3   2*2
+ *       P      4   2*6
+ *       B      5   2*4
+ *       B      6   2*5
+ ****************************************************************************/
+int     x264_encoder_encode( x264_t *h,
+                             x264_nal_t **pp_nal, int *pi_nal,
+                             x264_picture_t *pic )
+{
+    x264_frame_t   *frame_psnr = h->fdec; /* keep the current decoded frame for the psnr calculation (h->fdec is replaced by x264_reference_update) */
+    int     i_nal_type;
+    int     i_nal_ref_idc;
+    int     i_slice_type;
+
+    int i;
+
+    float psnr_y, psnr_u, psnr_v;
+
+    int   i_global_qp;
+
+    int i_mb_count[18];
+
+    /* no data out */
+    *pi_nal = 0;
+    *pp_nal = NULL;
+
+    /* ------------------- Setup new frame from picture -------------------- */
+    TIMER_START( i_mtime_encode_frame );
+    if( pic != NULL )
+    {
+        /* Copy the picture to a frame, init the frame and move it to a buffer */
+        /* 1: get a frame */
+        x264_frame_t *fenc = x264_frame_get( h->frames.unused );
+
+        x264_frame_copy_picture( h, fenc, pic );
+
+        /* 2: get its type */
+        if( ( h->frames.i_last_i + 1 >= h->param.i_iframe && h->frames.i_last_idr + 1 >= h->param.i_idrframe ) ||
+            pic->i_type == X264_TYPE_IDR )
+        {
+            /* IDR */
+            fenc->i_type = X264_TYPE_IDR;
+
+            h->i_poc       = 0;
+            h->i_frame_num = 0;
+
+            /* Last schedule B frames need to be encoded as P */
+            if( h->frames.next[0] != NULL )
+            {
+                x264_frame_t *tmp;
+                int i = 0;
+
+                while( h->frames.next[i+1] != NULL ) i++;
+                h->frames.next[i]->i_type = X264_TYPE_P;
+
+                /* remove this P from next */
+                tmp = h->frames.next[i];
+                h->frames.next[i] = NULL;
+
+                /* move this P + Bs to current */
+                x264_frame_put( h->frames.current, tmp );
+                while( ( tmp = x264_frame_get( h->frames.next ) ) )
+                {
+                    x264_frame_put( h->frames.current, tmp );
+                }
+            }
+        }
+        else if( h->param.i_bframe > 0 )
+        {
+            if( h->frames.i_last_i  + 1 >= h->param.i_iframe )
+                fenc->i_type = X264_TYPE_I;
+            else if( h->frames.next[h->param.i_bframe-1] != NULL )
+                fenc->i_type = X264_TYPE_P;
+            else if( pic->i_type == X264_TYPE_AUTO )
+                fenc->i_type = X264_TYPE_B;
+            else
+                fenc->i_type = pic->i_type;
+        }
+        else
+        {
+            if( pic->i_type == X264_TYPE_AUTO )
+            {
+                if( h->frames.i_last_i + 1 >= h->param.i_iframe )
+                    fenc->i_type = X264_TYPE_I;
+                else
+                    fenc->i_type = X264_TYPE_P;
+            }
+            else
+            {
+                fenc->i_type = pic->i_type;
+            }
+        }
+
+        fenc->i_poc = h->i_poc;
+        if( fenc->i_type == X264_TYPE_IDR )
+        {
+            h->frames.i_last_idr = 0;
+            h->frames.i_last_i = 0;
+        }
+        else if( fenc->i_type == X264_TYPE_I )
+        {
+            h->frames.i_last_idr++;
+            h->frames.i_last_i = 0;
+        }
+        else
+        {
+            h->frames.i_last_i++;
+        }
+
+        /* 3: Update current/next */
+        if( fenc->i_type == X264_TYPE_B )
+        {
+            x264_frame_put( h->frames.next, fenc );
+        }
+        else
+        {
+            x264_frame_put( h->frames.current, fenc );
+            while( ( fenc = x264_frame_get( h->frames.next ) ) )
+            {
+                x264_frame_put( h->frames.current, fenc );
+            }
+        }
+        h->i_poc += 2;
+    }
+    else    /* No more picture, begin encoding of last frames */
+    {
+        /* Move all next frames to current and mark the last one as a P */
+        x264_frame_t *tmp;
+        int i = -1;
+        while( h->frames.next[i+1] != NULL ) i++;
+        if( i >= 0 )
+        {
+            h->frames.next[i]->i_type = X264_TYPE_P;
+            tmp = h->frames.next[i];
+            h->frames.next[i] = NULL;
+
+            x264_frame_put( h->frames.current, tmp );
+            while( ( tmp = x264_frame_get( h->frames.next ) ) )
+            {
+                x264_frame_put( h->frames.current, tmp );
+            }
+        }
+    }
+    TIMER_STOP( i_mtime_encode_frame );
+
+    /* ------------------- Get frame to be encoded ------------------------- */
+    /* 4: get picture to encode */
+    h->fenc = x264_frame_get( h->frames.current );
+    if( h->fenc == NULL )
+    {
+        /* Nothing yet to encode (ex: waiting for I/P with B frames) */
+        /* waiting for filling bframe buffer */
+        if( pic != NULL ) pic->i_type = X264_TYPE_AUTO;   /* pic is NULL when the caller is flushing */
+        return 0;
+    }
+    x264_frame_put( h->frames.unused, h->fenc );  /* Safe to do it now, we don't use frames.unused for the rest */
+
+    /* ------------------- Setup frame context ----------------------------- */
+    /* 5: Init data dependant of frame type */
+    TIMER_START( i_mtime_encode_frame );
+    if( h->fenc->i_type == X264_TYPE_IDR )
+    {
+        /* reset ref pictures */
+        x264_reference_reset( h );
+
+        i_nal_type    = NAL_SLICE_IDR;
+        i_nal_ref_idc = NAL_PRIORITY_HIGHEST;
+        i_slice_type = SLICE_TYPE_I;
+    }
+    else if( h->fenc->i_type == X264_TYPE_I )
+    {
+        i_nal_type    = NAL_SLICE;
+        i_nal_ref_idc = NAL_PRIORITY_HIGH; /* Not completely true but for now it is (as all I/P are kept as ref)*/
+        i_slice_type = SLICE_TYPE_I;
+    }
+    else if( h->fenc->i_type == X264_TYPE_P )
+    {
+        i_nal_type    = NAL_SLICE;
+        i_nal_ref_idc = NAL_PRIORITY_HIGH; /* Not completely true but for now it is (as all I/P are kept as ref)*/
+        i_slice_type = SLICE_TYPE_P;
+    }
+    else    /* B frame */
+    {
+        i_nal_type    = NAL_SLICE;
+        i_nal_ref_idc = NAL_PRIORITY_DISPOSABLE;
+        i_slice_type = SLICE_TYPE_B;
+    }
+
+    h->fdec->i_type = h->fenc->i_type;
+    if( pic != NULL ) pic->i_type = h->fenc->i_type;
+    h->fdec->i_poc  = h->fenc->i_poc;
+
+    /* ------------------- Init                ----------------------------- */
+    /* Init the rate control */
+    x264_ratecontrol_start( h->rc, i_slice_type );
+    i_global_qp = x264_ratecontrol_qp( h->rc );
+    if( h->fenc->i_qpplus1 > 0 )
+    {
+        i_global_qp = x264_clip3( h->fenc->i_qpplus1 - 1, 0, 51 );
+    }
+
+    /* build ref list 0/1 */
+    x264_reference_build_list( h, h->fdec->i_poc );
+
+    /* increase frame num but only once for B frame */
+    if( i_slice_type != SLICE_TYPE_B || h->sh.i_type != SLICE_TYPE_B )
+    {
+        h->i_frame_num++;
+    }
+
+    /* ------------------------ Create slice header  ----------------------- */
+    x264_slice_init( h, i_nal_type, i_slice_type, i_global_qp );
+
+    /* ---------------------- Write the bitstream -------------------------- */
+    /* Init bitstream context */
+    h->out.i_nal = 0;
+    bs_init( &h->out.bs, h->out.p_bitstream, h->out.i_bitstream );
+
+    /* Write SPS and PPS */
+    if( i_nal_type == NAL_SLICE_IDR )
+    {
+        /* generate sequence parameters */
+        x264_nal_start( h, NAL_SPS, NAL_PRIORITY_HIGHEST );
+        x264_sps_write( &h->out.bs, h->sps );
+        x264_nal_end( h );
+
+        /* generate picture parameters */
+        x264_nal_start( h, NAL_PPS, NAL_PRIORITY_HIGHEST );
+        x264_pps_write( &h->out.bs, h->pps );
+        x264_nal_end( h );
+    }
+
+    /* Write the slice */
+    x264_slice_write( h, i_nal_type, i_nal_ref_idc, i_mb_count );
+
+    /* End bitstream, set output  */
+    *pi_nal = h->out.i_nal;
+    *pp_nal = &h->out.nal[0];
+
+    /* Set output picture properties */
+    /* (type and pts of the frame actually coded; nothing to report to when flushing with pic == NULL) */
+    if( pic != NULL )
+    {
+        pic->i_type = i_slice_type == SLICE_TYPE_I ? ( i_nal_type == NAL_SLICE_IDR ? X264_TYPE_IDR : X264_TYPE_I ) :
+                      i_slice_type == SLICE_TYPE_P ? X264_TYPE_P : X264_TYPE_B;
+        pic->i_pts = h->fenc->i_pts;
+    }
+
+    /* ---------------------- Update encoder state ------------------------- */
+    /* update cabac */
+    if( h->param.b_cabac )
+    {
+        x264_cabac_model_update( &h->cabac, i_slice_type, h->sh.pps->i_pic_init_qp + h->sh.i_qp_delta );
+    }
+
+    /* handle references */
+    if( i_nal_ref_idc != NAL_PRIORITY_DISPOSABLE )
+    {
+        x264_reference_update( h );
+    }
+
+    /* increase frame count */
+    h->i_frame++;
+
+    /* update rc */
+    x264_ratecontrol_end( h->rc, h->out.nal[h->out.i_nal-1].i_payload * 8 );
+
+    /* restore CPU state (before using float again) */
+    x264_cpu_restore( h->param.cpu );
+
+    TIMER_STOP( i_mtime_encode_frame );
+
+    /* ---------------------- Compute/Print statistics --------------------- */
+    /* PSNR */
+    psnr_y = x264_psnr( frame_psnr->plane[0], frame_psnr->i_stride[0], h->fenc->plane[0], h->fenc->i_stride[0], h->param.i_width, h->param.i_height );
+    psnr_u = x264_psnr( frame_psnr->plane[1], frame_psnr->i_stride[1], h->fenc->plane[1], h->fenc->i_stride[1], h->param.i_width/2, h->param.i_height/2);
+    psnr_v = x264_psnr( frame_psnr->plane[2], frame_psnr->i_stride[2], h->fenc->plane[2], h->fenc->i_stride[2], h->param.i_width/2, h->param.i_height/2);
+
+    /* Slice stat */
+    h->stat.i_slice_count[i_slice_type]++;
+    h->stat.i_slice_size[i_slice_type] += bs_pos( &h->out.bs) / 8;
+    h->stat.f_psnr_y[i_slice_type] += psnr_y;
+    h->stat.f_psnr_u[i_slice_type] += psnr_u;
+    h->stat.f_psnr_v[i_slice_type] += psnr_v;
+
+    for( i = 0; i < 17; i++ )
+    {
+        h->stat.i_mb_count[h->sh.i_type][i] += i_mb_count[i];
+    }
+
+    /* print stat */
+    fprintf( stderr, "frame=%4d NAL=%d Slice:%c Poc:%-3d I4x4:%-5d I16x16:%-5d P:%-5d SKIP:%-3d size=%d bytes PSNR Y:%2.2f U:%2.2f V:%2.2f\n",
+             h->i_frame - 1,
+             i_nal_ref_idc,
+             i_slice_type == SLICE_TYPE_I ? 'I' : (i_slice_type == SLICE_TYPE_P ? 'P' : 'B' ),
+             frame_psnr->i_poc,
+             i_mb_count[I_4x4],
+             i_mb_count[I_16x16],
+             i_mb_count[P_L0] + i_mb_count[P_8x8],
+             i_mb_count[P_SKIP],
+             h->out.nal[h->out.i_nal-1].i_payload,
+             psnr_y, psnr_u, psnr_v );
+#ifdef DEBUG_MB_TYPE
+    for( i = 0; i < h->sps->i_mb_width * h->sps->i_mb_height; i++ )
+    {
+        const int i_mb_y = i / h->sps->i_mb_width;
+        const int i_mb_x = i % h->sps->i_mb_width;
+
+        if( i_mb_y > 0 && i_mb_x == 0 )
+            fprintf( stderr, "\n" );
+
+        if( h->mb.type[i] == I_4x4 )
+            fprintf( stderr, "i" );
+        else if( h->mb.type[i] == I_16x16 )
+            fprintf( stderr, "I" );
+        else if( h->mb.type[i] == P_SKIP )
+            fprintf( stderr, "S" );
+        else if( h->mb.type[i] == P_8x8 )
+            fprintf( stderr, "8" );
+        else if( h->mb.type[i] == P_L0 )
+            fprintf( stderr, "P" );
+        else
+            fprintf( stderr, "?" );
+
+        fprintf( stderr, " " );
+    }
+#endif
+
+#if DEBUG_DUMP_FRAME
+    /* Dump reconstructed frame */
+    x264_frame_dump( h, frame_psnr, "fdec.yuv" );
+#endif
+#if 0
+    if( h->i_ref0 > 0 )
+    {
+        x264_frame_dump( h, h->fref0[0], "ref0.yuv" );
+    }
+    if( h->i_ref1 > 0 )
+    {
+        x264_frame_dump( h, h->fref1[0], "ref1.yuv" );
+    }
+#endif
+    return 0;
+}
+
+/****************************************************************************
+ * x264_encoder_close: print the statistics on stderr, then free the encoder
+ ****************************************************************************/
+void    x264_encoder_close  ( x264_t *h )
+{
+    int64_t i_mtime_total = i_mtime_analyse + i_mtime_encode + i_mtime_write + i_mtime_filter + 1;
+    int i;
+
+    fprintf( stderr, "x264: analyse=%d(%lldms) encode=%d(%lldms) write=%d(%lldms) filter=%d(%lldms)\n",
+             (int)(100*i_mtime_analyse/i_mtime_total), (long long)(i_mtime_analyse/1000),
+             (int)(100*i_mtime_encode/i_mtime_total), (long long)(i_mtime_encode/1000),
+             (int)(100*i_mtime_write/i_mtime_total), (long long)(i_mtime_write/1000),
+             (int)(100*i_mtime_filter/i_mtime_total), (long long)(i_mtime_filter/1000) );
+
+    /* Slices used and PSNR */
+    if( h->stat.i_slice_count[SLICE_TYPE_I] > 0 )
+    {
+        const int i_count = h->stat.i_slice_count[SLICE_TYPE_I];
+        fprintf( stderr, "x264: slice I:%-4d Avg size:%-5d PSNR Y:%2.2f U:%2.2f V:%2.2f PSNR-Y/Size:%2.2f\n",
+                 i_count,
+                 h->stat.i_slice_size[SLICE_TYPE_I] / i_count,
+                 h->stat.f_psnr_y[SLICE_TYPE_I] / i_count,
+                 h->stat.f_psnr_u[SLICE_TYPE_I] / i_count,
+                 h->stat.f_psnr_v[SLICE_TYPE_I] / i_count,
+                 1000*h->stat.f_psnr_y[SLICE_TYPE_I] / h->stat.i_slice_size[SLICE_TYPE_I] );
+    }
+    if( h->stat.i_slice_count[SLICE_TYPE_P] > 0 )
+    {
+        const int i_count = h->stat.i_slice_count[SLICE_TYPE_P];
+        fprintf( stderr, "x264: slice P:%-4d Avg size:%-5d PSNR Y:%2.2f U:%2.2f V:%2.2f PSNR-Y/Size:%2.2f\n",
+                i_count,
+                h->stat.i_slice_size[SLICE_TYPE_P] / i_count,
+                h->stat.f_psnr_y[SLICE_TYPE_P] / i_count,
+                h->stat.f_psnr_u[SLICE_TYPE_P] / i_count,
+                h->stat.f_psnr_v[SLICE_TYPE_P] / i_count,
+                1000.0*h->stat.f_psnr_y[SLICE_TYPE_P] / h->stat.i_slice_size[SLICE_TYPE_P] );
+    }
+    if( h->stat.i_slice_count[SLICE_TYPE_B] > 0 )
+    {
+        fprintf( stderr, "x264: slice B:%-4d Avg size:%-5d PSNR Y:%2.2f U:%2.2f V:%2.2f PSNR-Y/Size:%2.2f\n",
+                h->stat.i_slice_count[SLICE_TYPE_B],
+                h->stat.i_slice_size[SLICE_TYPE_B] / h->stat.i_slice_count[SLICE_TYPE_B],
+                h->stat.f_psnr_y[SLICE_TYPE_B] / h->stat.i_slice_count[SLICE_TYPE_B],
+                h->stat.f_psnr_u[SLICE_TYPE_B] / h->stat.i_slice_count[SLICE_TYPE_B],
+                h->stat.f_psnr_v[SLICE_TYPE_B] / h->stat.i_slice_count[SLICE_TYPE_B],
+                1000*h->stat.f_psnr_y[SLICE_TYPE_B] / h->stat.i_slice_size[SLICE_TYPE_B] );
+    }
+
+    /* MB types used */
+    if( h->stat.i_slice_count[SLICE_TYPE_I] > 0 )
+    {
+        const int i_count =  h->stat.i_slice_count[SLICE_TYPE_I];
+        fprintf( stderr, "x264: slice I      Avg I4x4:%-5d I16x16:%-5d\n",
+                 h->stat.i_mb_count[SLICE_TYPE_I][I_4x4]  / i_count,
+                 h->stat.i_mb_count[SLICE_TYPE_I][I_16x16]/ i_count );
+    }
+    if( h->stat.i_slice_count[SLICE_TYPE_P] > 0 )
+    {
+        const int i_count = h->stat.i_slice_count[SLICE_TYPE_P];
+        fprintf( stderr, "x264: slice P      Avg I4x4:%-5d I16x16:%-5d P:%-5d P8x8:%-5d PSKIP:%-5d\n",
+                 h->stat.i_mb_count[SLICE_TYPE_P][I_4x4]  / i_count,
+                 h->stat.i_mb_count[SLICE_TYPE_P][I_16x16]/ i_count,
+                 h->stat.i_mb_count[SLICE_TYPE_P][P_L0] / i_count,
+                 h->stat.i_mb_count[SLICE_TYPE_P][P_8x8] / i_count,
+                 h->stat.i_mb_count[SLICE_TYPE_P][P_SKIP] /i_count );
+    }
+
+    if( h->stat.i_slice_count[SLICE_TYPE_I] + h->stat.i_slice_count[SLICE_TYPE_P] + h->stat.i_slice_count[SLICE_TYPE_B] > 0 )
+    {
+        const int i_count = h->stat.i_slice_count[SLICE_TYPE_I] + h->stat.i_slice_count[SLICE_TYPE_P] +
+                            h->stat.i_slice_count[SLICE_TYPE_B];
+        /* only reached when at least one slice was coded, so the averages never divide by 0 */
+        fprintf( stderr, "x264: overall PSNR Y:%2.2f U:%2.2f V:%2.2f kb/s:%.1f fps:%.3f\n",
+                 (h->stat.f_psnr_y[SLICE_TYPE_I]+h->stat.f_psnr_y[SLICE_TYPE_P]+h->stat.f_psnr_y[SLICE_TYPE_B]) / i_count,
+                 (h->stat.f_psnr_u[SLICE_TYPE_I]+h->stat.f_psnr_u[SLICE_TYPE_P]+h->stat.f_psnr_u[SLICE_TYPE_B]) / i_count,
+                 (h->stat.f_psnr_v[SLICE_TYPE_I]+h->stat.f_psnr_v[SLICE_TYPE_P]+h->stat.f_psnr_v[SLICE_TYPE_B]) / i_count,
+                 h->param.f_fps * 8*(h->stat.i_slice_size[SLICE_TYPE_I]+h->stat.i_slice_size[SLICE_TYPE_P]+h->stat.i_slice_size[SLICE_TYPE_B]) / i_count / 1024,
+                 (double)1000000.0 * (double)i_count / (double)i_mtime_encode_frame );
+    }
+
+    /* frames */
+    for( i = 0; i < X264_BFRAME_MAX + 1; i++ )
+    {
+        if( h->frames.current[i] ) x264_frame_delete( h->frames.current[i] );
+        if( h->frames.next[i] )    x264_frame_delete( h->frames.next[i] );
+        if( h->frames.unused[i] )  x264_frame_delete( h->frames.unused[i] );
+    }
+    /* ref frames */
+    for( i = 0; i < h->param.i_frame_reference+2; i++ )
+    {
+        x264_frame_delete( h->frames.reference[i] );
+    }
+
+    /* rc */
+    x264_ratecontrol_delete( h->rc );
+
+    x264_macroblock_cache_end( h );
+    x264_free( h->out.p_bitstream );
+    x264_free( h );
+}
+
diff --git a/encoder/macroblock.c b/encoder/macroblock.c
new file mode 100644
index 00000000..353f3d4b
--- /dev/null
+++ b/encoder/macroblock.c
@@ -0,0 +1,859 @@
+/*****************************************************************************
+ * macroblock.c: h264 encoder library
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: macroblock.c,v 1.1 2004/06/03 19:27:08 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdint.h>
+
+#include "../core/common.h"
+#include "macroblock.h"
+
+
+static const uint8_t block_idx_x[16] =     /* column, in 4x4-block units, of luma block idx inside the macroblock */
+{
+    0, 1, 0, 1, 2, 3, 2, 3, 0, 1, 0, 1, 2, 3, 2, 3
+};
+static const uint8_t block_idx_y[16] =     /* row, in 4x4-block units, of luma block idx inside the macroblock */
+{
+    0, 0, 1, 1, 0, 0, 1, 1, 2, 2, 3, 3, 2, 2, 3, 3
+};
+static const uint8_t block_idx_xy[4][4] =  /* inverse of the two tables above: block_idx_xy[x][y] is the block index */
+{
+    { 0, 2, 8,  10},
+    { 1, 3, 9,  11},
+    { 4, 6, 12, 14},
+    { 5, 7, 13, 15}
+};
+
+static const int quant_mf[6][4][4] =       /* quantisation multipliers, indexed [i_qscale % 6][y][x] by the quant_* functions */
+{
+    {  { 13107, 8066, 13107, 8066}, {  8066, 5243,  8066, 5243},
+       { 13107, 8066, 13107, 8066}, {  8066, 5243,  8066, 5243}  },
+    {  { 11916, 7490, 11916, 7490}, {  7490, 4660,  7490, 4660},
+       { 11916, 7490, 11916, 7490}, {  7490, 4660,  7490, 4660}  },
+    {  { 10082, 6554, 10082, 6554}, {  6554, 4194,  6554, 4194},
+       { 10082, 6554, 10082, 6554}, {  6554, 4194,  6554, 4194}  },
+    {  {  9362, 5825,  9362, 5825}, {  5825, 3647,  5825, 3647},
+       {  9362, 5825,  9362, 5825}, {  5825, 3647,  5825, 3647}  },
+    {  {  8192, 5243,  8192, 5243}, {  5243, 3355,  5243, 3355},
+       {  8192, 5243,  8192, 5243}, {  5243, 3355,  5243, 3355}  },
+    {  {  7282, 4559,  7282, 4559}, {  4559, 2893,  4559, 2893},
+       {  7282, 4559,  7282, 4559}, {  4559, 2893,  4559, 2893}  }
+};
+
+static const int i_chroma_qp_table[52] =   /* NOTE(review): presumably maps a luma qp (0..51) to the chroma qp - confirm against its users */
+{
+     0,  1,  2,  3,  4,  5,  6,  7,  8,  9,
+    10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+    20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+    29, 30, 31, 32, 32, 33, 34, 34, 35, 35,
+    36, 36, 37, 37, 37, 38, 38, 38, 39, 39,
+    39, 39
+};
+
+/****************************************************************************
+ * Scan and Quant functions
+ ****************************************************************************/
+//static const int scan_zigzag_x[16]={0, 1, 0, 0, 1, 2, 3, 2, 1, 0, 1, 2, 3, 3, 2, 3};
+//static const int scan_zigzag_y[16]={0, 0, 1, 2, 1, 0, 0, 1, 2, 3, 3, 2, 1, 2, 3, 3};
+
+/* scan_zigzag_4x4full:
+ * Zigzag scan of a complete 4x4 block, dct[0][0] included.
+ * level[k] receives dct[zz_row[k]][zz_col[k]] for k = 0..15; dct is left
+ * untouched. */
+static inline void scan_zigzag_4x4full( int level[16], int16_t dct[4][4] )
+{
+    /* first (row) index of dct for every scan position */
+    static const uint8_t zz_row[16] =
+    {
+        0, 0, 1, 2,  1, 0, 0, 1,
+        2, 3, 3, 2,  1, 2, 3, 3
+    };
+
+    /* second (column) index of dct for every scan position */
+    static const uint8_t zz_col[16] =
+    {
+        0, 1, 0, 0,  1, 2, 3, 2,
+        1, 0, 1, 2,  3, 3, 2, 3
+    };
+    int k;
+
+    for( k = 0; k < 16; k++ )
+    {
+        level[k] = dct[zz_row[k]][zz_col[k]];
+    }
+}
+/* scan_zigzag_4x4:
+ * Zigzag scan of a 4x4 block without its first coefficient: same order as
+ * scan_zigzag_4x4full but dct[0][0] is skipped, so level[0] is dct[0][1]. */
+static inline void scan_zigzag_4x4( int level[15], int16_t dct[4][4] )
+{
+    /* first (row) index of dct for every scan position */
+    static const uint8_t zz_row[15] =
+    {
+        0, 1, 2,  1, 0, 0, 1,
+        2, 3, 3, 2,  1, 2, 3, 3
+    };
+
+    /* second (column) index of dct for every scan position */
+    static const uint8_t zz_col[15] =
+    {
+        1, 0, 0,  1, 2, 3, 2,
+        1, 0, 1, 2,  3, 3, 2, 3
+    };
+    int k;
+
+    for( k = 0; k < 15; k++ )
+    {
+        level[k] = dct[zz_row[k]][zz_col[k]];
+    }
+}
+
+static inline void scan_zigzag_2x2_dc( int level[4], int16_t dct[2][2] )
+{
+    int k;
+    /* plain row by row copy: row = k / 2, column = k % 2 */
+    for( k = 0; k < 4; k++ )
+        level[k] = dct[k >> 1][k & 1];
+}
+
+
+static void quant_4x4( int16_t dct[4][4], int i_qscale, int b_intra )
+{
+    /* Quantise a 4x4 block in place. The magnitude of each coefficient is multiplied by
+     * quant_mf[i_qscale % 6][y][x], the rounding offset is added (a third of 1 << shift for
+     * intra blocks, a sixth otherwise) and the sum is shifted right by 15 + i_qscale / 6;
+     * a strictly positive input keeps the result, any other input gets it negated. */
+    const int i_shift = 15 + i_qscale / 6;
+    const int (*mf)[4] = quant_mf[i_qscale % 6];
+    const int i_round = ( 1 << i_shift ) / ( b_intra ? 3 : 6 );
+    int i_row, i_col;
+
+    for( i_row = 0; i_row < 4; i_row++ )
+    {
+        for( i_col = 0; i_col < 4; i_col++ )
+        {
+            const int i_coef = dct[i_row][i_col];
+            const int i_mag  = ( i_round + ( i_coef > 0 ? i_coef : -i_coef ) * mf[i_row][i_col] ) >> i_shift;
+
+            dct[i_row][i_col] = i_coef > 0 ? i_mag : -i_mag;
+        }
+    }
+}
+static void quant_4x4_dc( int16_t dct[4][4], int i_qscale )
+{
+    /* Quantise a 4x4 array of DC coefficients in place. Every position uses the same
+     * multiplier quant_mf[i_qscale % 6][0][0]. Compared with quant_4x4 the shift is one
+     * bit larger (16 + i_qscale / 6) and the rounding offset is always a third of
+     * 1 << shift. */
+    const int i_shift = 16 + i_qscale / 6;
+    const int i_round = ( 1 << i_shift ) / 3;
+    const int i_scale = quant_mf[i_qscale % 6][0][0];
+    int i_row, i_col;
+
+    for( i_row = 0; i_row < 4; i_row++ )
+    {
+        for( i_col = 0; i_col < 4; i_col++ )
+        {
+            const int i_coef = dct[i_row][i_col];
+            const int i_mag  = ( i_round + ( i_coef > 0 ? i_coef : -i_coef ) * i_scale ) >> i_shift;
+
+            dct[i_row][i_col] = i_coef > 0 ? i_mag : -i_mag;
+        }
+    }
+}
+static void quant_2x2_dc( int16_t dct[2][2], int i_qscale, int b_intra )
+{
+    /* Quantise a 2x2 array of DC coefficients in place. Every position uses the same
+     * multiplier quant_mf[i_qscale % 6][0][0] and a shift of 16 + i_qscale / 6; the
+     * rounding offset is a third of 1 << shift for intra blocks, a sixth otherwise.
+     * A strictly positive input keeps the result, any other input gets it negated. */
+    const int i_shift = 16 + i_qscale / 6;
+    const int i_round = ( 1 << i_shift ) / ( b_intra ? 3 : 6 );
+    const int i_scale = quant_mf[i_qscale % 6][0][0];
+    int i_row, i_col;
+
+    for( i_row = 0; i_row < 2; i_row++ )
+    {
+        for( i_col = 0; i_col < 2; i_col++ )
+        {
+            const int i_coef = dct[i_row][i_col];
+            const int i_mag  = ( i_round + ( i_coef > 0 ? i_coef : -i_coef ) * i_scale ) >> i_shift;
+
+            dct[i_row][i_col] = i_coef > 0 ? i_mag : -i_mag;
+        }
+    }
+}
+#if 0
+/* Disabled alternative quantisers: rounding offsets given per coefficient position as [num][den] fractions (from a JVT doc) */
+static const int f_deadzone_intra[4][4][2] = /* [num][den] */
+{
+    { {1,2}, {3,7}, {2,5}, {1,3} },
+    { {3,7}, {2,5}, {1,3}, {1,4} },
+    { {2,5}, {1,3}, {1,4}, {1,5} },
+    { {1,3}, {1,4}, {1,5}, {1,5} }
+};
+static const int f_deadzone_inter[4][4][2] = /* [num][den] */
+{
+    { {1,3}, {2,7}, {4,15},{2,9} },
+    { {2,7}, {4,15},{2,9}, {1,6} },
+    { {4,15},{2,9}, {1,6}, {1,7} },
+    { {2,9}, {1,6}, {1,7}, {2,15} }
+};
+
+/* Same computation as the active quant_4x4, except that f is num * (1 << i_qbits) / den taken from the tables above */
+static void quant_4x4( int16_t dct[4][4], int i_qscale, int b_intra )
+{
+    const int(*f_deadzone)[4][4][2] = b_intra ? &f_deadzone_intra : &f_deadzone_inter;
+    const int i_qbits = 15 + i_qscale / 6;
+    const int i_mf = i_qscale % 6;
+
+    int x,y;
+    for( y = 0; y < 4; y++ )
+    {
+        for( x = 0; x < 4; x++ )
+        {
+#if 0
+            const int f = b_intra ?
+                          (f_deadzone_intra[y][x][0] * ( 1 << i_qbits ) / f_deadzone_intra[y][x][1])
+                          :
+                          (f_deadzone_inter[y][x][0] * ( 1 << i_qbits ) / f_deadzone_inter[y][x][1]);
+#else
+            const int f = (*f_deadzone)[y][x][0] * ( 1 << i_qbits ) / (*f_deadzone)[y][x][1];
+#endif
+
+            if( dct[y][x] > 0 )
+            {
+                dct[y][x] =( f + dct[y][x]  * quant_mf[i_mf][y][x] ) >> i_qbits;
+            }
+            else
+            {
+                dct[y][x] = - ( ( f - dct[y][x]  * quant_mf[i_mf][y][x] ) >> i_qbits );
+            }
+        }
+    }
+}
+
+static void quant_4x4_dc( int16_t dct[4][4], int i_qscale )
+{
+    const int i_qbits = 15 + i_qscale / 6;
+    const int i_qmf = quant_mf[i_qscale%6][0][0];
+    const int f2 = f_deadzone_intra[0][0][0] * ( 2 << i_qbits ) / f_deadzone_intra[0][0][1];
+    int x,y;
+
+    for( y = 0; y < 4; y++ )
+    {
+        for( x = 0; x < 4; x++ )
+        {
+
+            if( dct[y][x] > 0 )
+            {
+                dct[y][x] =( f2 + dct[y][x]  * i_qmf) >> ( 1 + i_qbits );
+            }
+            else
+            {
+                dct[y][x] = - ( ( f2 - dct[y][x]  * i_qmf ) >> (1 + i_qbits ) );
+            }
+        }
+    }
+}
+
+static void quant_2x2_dc( int16_t dct[2][2], int i_qscale, int b_intra )
+{
+    int const i_qbits = 15 + i_qscale / 6;
+    const int i_qmf = quant_mf[i_qscale%6][0][0];
+    const int f2 = b_intra ?
+                   (f_deadzone_intra[0][0][0] * ( 2 << i_qbits ) / f_deadzone_intra[0][0][1])
+                   :
+                   (f_deadzone_inter[0][0][0] * ( 2 << i_qbits ) / f_deadzone_inter[0][0][1]);
+    int x,y;
+    for( y = 0; y < 2; y++ )
+    {
+        for( x = 0; x < 2; x++ )
+        {
+            if( dct[y][x] > 0 )
+            {
+                dct[y][x] =( f2 + dct[y][x]  * i_qmf) >> ( 1 + i_qbits );
+            }
+            else
+            {
+                dct[y][x] = - ( ( f2 - dct[y][x]  * i_qmf ) >> (1 + i_qbits ) );
+            }
+        }
+    }
+}
+
+
+#endif
+
+static inline int array_non_zero_count( int *v, int i_count )
+{
+    /* Number of non zero entries among v[0] .. v[i_count - 1]
+     * (0 when i_count <= 0). */
+    int i_found = 0;
+    int i_pos = 0;
+
+    while( i_pos < i_count )
+    {
+        i_found += ( v[i_pos++] != 0 );
+    }
+
+    return i_found;
+}
+
+/* x264_mb_decimate_score (ref: JVT-B118):
+ * Score a list of quantised coefficients to tell whether it is cheap enough to be
+ * dropped altogether (a low score means: set the coefficients to zero).
+ * Any coefficient with an absolute value above 1 gives the score 9 at once.
+ * Otherwise each non zero coefficient adds i_ds_table[run], run being the number of
+ * zeros right before it: 3 for no zero, 2 for one or two, 1 for three to five, else 0.
+ * Used in inter macroblock (luma and chroma)
+ *  luma: for a 8x8 block: if score < 4 -> null
+ *        for the complete mb: if score < 6 -> null
+ *  chroma: for the complete mb: if score < 7 -> null
+ * i_max must not exceed 16, the size of the run table.
+ */
+static int x264_mb_decimate_score( int *dct, int i_max )
+{
+    static const int i_ds_table[16] = { 3, 2, 2, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+
+    int i_total = 0;
+    int i_pos;
+
+    /* start from the last non zero coefficient */
+    for( i_pos = i_max - 1; i_pos >= 0 && dct[i_pos] == 0; i_pos-- )
+        ;
+
+    while( i_pos >= 0 )
+    {
+        int i_zeros = 0;
+
+        /* a level bigger than 1 ends the scoring with 9 */
+        if( abs( dct[i_pos] ) > 1 )
+            return 9;
+
+        /* count the zeros in front of this coefficient and step over them */
+        for( i_pos--; i_pos >= 0 && dct[i_pos] == 0; i_pos-- )
+            i_zeros++;
+
+        i_total += i_ds_table[i_zeros];
+    }
+
+    return i_total;
+}
+
+void x264_mb_encode_i4x4( x264_t *h, int idx, int i_qscale )
+{
+    /* Code luma 4x4 block idx against what fdec currently holds there; the
+     * scanned levels go to h->dct.block[idx].luma4x4 and fdec is rebuilt. */
+    const int i_stride = h->mb.pic.i_stride[0];
+    const int i_offset = 4 * block_idx_x[idx] + 4 * block_idx_y[idx] * i_stride;
+    uint8_t *p_enc = h->mb.pic.p_fenc[0] + i_offset;
+    uint8_t *p_rec = h->mb.pic.p_fdec[0] + i_offset;
+    int16_t coef[4][4];
+
+    /* forward path: residual dct, quantisation, zigzag scan */
+    h->dctf.sub4x4_dct( coef, p_enc, i_stride, p_rec, i_stride );
+    quant_4x4( coef, i_qscale, 1 );
+    scan_zigzag_4x4full( h->dct.block[idx].luma4x4, coef );
+
+    /* output samples to fdec: dequantise, then add the idct */
+    x264_mb_dequant_4x4( coef, i_qscale );
+    h->dctf.add4x4_idct( p_rec, i_stride, coef );
+}
+
+static void x264_mb_encode_i16x16( x264_t *h, int i_qscale )
+{
+    /* Code the luma of an I_16x16 macroblock. Slot 0 of the work buffer
+     * collects the 16 DC coefficients, slots 1..16 hold the 4x4 blocks. */
+    int16_t coef[1+16][4][4];
+    uint8_t *p_enc = h->mb.pic.p_fenc[0];
+    uint8_t *p_rec = h->mb.pic.p_fdec[0];
+    const int i_stride = h->mb.pic.i_stride[0];
+    int blk;
+
+    h->dctf.sub16x16_dct( &coef[1], p_enc, i_stride, p_rec, i_stride );
+
+    blk = 0;
+    while( blk < 16 )
+    {
+        /* set the DC aside before the block is quantised */
+        coef[0][block_idx_y[blk]][block_idx_x[blk]] = coef[blk+1][0][0];
+
+        /* quant/scan/dequant of the block itself */
+        quant_4x4( coef[blk+1], i_qscale, 1 );
+        scan_zigzag_4x4( h->dct.block[blk].residual_ac, coef[blk+1] );
+        x264_mb_dequant_4x4( coef[blk+1], i_qscale );
+        blk++;
+    }
+
+    /* DC path: transform, quantise, scan */
+    h->dctf.dct4x4dc( coef[0] );
+    quant_4x4_dc( coef[0], i_qscale );
+    scan_zigzag_4x4full( h->dct.luma16x16_dc, coef[0] );
+
+    /* output samples to fdec (XXX not inversed) */
+    h->dctf.idct4x4dc( coef[0] );
+    x264_mb_dequant_4x4_dc( coef[0], i_qscale );
+
+    /* put every DC back into its block, then put pixels to fdec */
+    for( blk = 16; blk >= 1; blk-- )
+        coef[blk][0][0] = coef[0][block_idx_y[blk-1]][block_idx_x[blk-1]];
+    h->dctf.add16x16_idct( p_rec, i_stride, &coef[1] );
+}
+
+static void x264_mb_encode_8x8( x264_t *h, int b_inter, int i_qscale )
+{
+    /* Code both chroma planes of the macroblock: per plane, four 4x4
+     * blocks (AC levels) and one 2x2 block made of their DC coefficients.
+     * For an inter macroblock the AC part of a plane is dropped when its
+     * decimate score stays below 7. */
+    const int b_intra = b_inter ? 0 : 1;
+    int plane;
+
+    for( plane = 1; plane <= 2; plane++ )
+    {
+        /* first of the four h->dct.block[] entries used by this plane */
+        const int i_first = 16 + ( plane - 1 ) * 4;
+        const int i_stride = h->mb.pic.i_stride[plane];
+        uint8_t *p_enc = h->mb.pic.p_fenc[plane];
+        uint8_t *p_rec = h->mb.pic.p_fdec[plane];
+        int16_t dc[2][2];
+        int16_t ac[4][4][4];
+        int i_score = 0;
+        int blk, k;
+
+        h->dctf.sub8x8_dct( ac, p_enc, i_stride, p_rec, i_stride );
+
+        for( blk = 0; blk < 4; blk++ )
+        {
+            /* set the DC aside before the block is quantised */
+            dc[block_idx_y[blk]][block_idx_x[blk]] = ac[blk][0][0];
+
+            quant_4x4( ac[blk], i_qscale, b_intra );
+            scan_zigzag_4x4( h->dct.block[i_first+blk].residual_ac, ac[blk] );
+            x264_mb_dequant_4x4( ac[blk], i_qscale );
+
+            /* only inter macroblocks are scored for decimation */
+            if( !b_inter )
+                continue;
+            i_score += x264_mb_decimate_score( h->dct.block[i_first+blk].residual_ac, 15 );
+        }
+
+        /* DC path: transform, quantise, scan */
+        h->dctf.dct2x2dc( dc );
+        quant_2x2_dc( dc, i_qscale, b_intra );
+        scan_zigzag_2x2_dc( h->dct.chroma_dc[plane-1], dc );
+
+        /* output samples to fdec */
+        h->dctf.idct2x2dc( dc );
+        x264_mb_dequant_2x2_dc( dc, i_qscale );  /* XXX not inversed */
+
+        /* Near null chroma 8x8 block so make it null (bits saving) */
+        if( b_inter && i_score < 7 )
+        {
+            for( blk = 0; blk < 4; blk++ )
+            {
+                for( k = 0; k < 15; k++ )
+                    h->dct.block[i_first+blk].residual_ac[k] = 0;
+
+                for( k = 0; k < 16; k++ )
+                    ac[blk][k >> 2][k & 3] = 0;
+            }
+        }
+
+        /* put every DC back into its block before the inverse transform */
+        for( blk = 0; blk < 4; blk++ )
+        {
+            ac[blk][0][0] = dc[block_idx_y[blk]][block_idx_x[blk]];
+        }
+        h->dctf.add8x8_idct( p_rec, i_stride, ac );
+    }
+}
+
+/*****************************************************************************
+ * x264_macroblock_encode_pskip:
+ *  Encode an already marked skip block
+ *****************************************************************************/
+void x264_macroblock_encode_pskip( x264_t *h )
+{
+    /* the vector cached at mv[0][x264_scan8[0]] is applied to the whole mb */
+    const int i_mv_x = h->mb.cache.mv[0][x264_scan8[0]][0];
+    const int i_mv_y = h->mb.cache.mv[0][x264_scan8[0]][1];
+    int plane;
+    int blk;
+
+    /* Motion compensation XXX probably unneeded */
+    h->mc[MC_LUMA]( h->mb.pic.p_fref[0][0][0], h->mb.pic.i_stride[0],
+                    h->mb.pic.p_fdec[0],       h->mb.pic.i_stride[0],
+                    i_mv_x, i_mv_y, 16, 16 );
+
+    /* Chroma MC, one 8x8 call per chroma plane */
+    for( plane = 1; plane <= 2; plane++ )
+    {
+        h->mc[MC_CHROMA]( h->mb.pic.p_fref[0][0][plane], h->mb.pic.i_stride[plane],
+                          h->mb.pic.p_fdec[plane],       h->mb.pic.i_stride[plane],
+                          i_mv_x, i_mv_y, 8, 8 );
+    }
+
+    /* nothing is coded: clear the non zero count of all 16+8 blocks
+     * addressed through x264_scan8 */
+    for( blk = 16 + 8 - 1; blk >= 0; blk-- )
+        h->mb.cache.non_zero_count[x264_scan8[blk]] = 0;
+
+    /* empty coded block pattern, both locally and in the per-mb store */
+    h->mb.i_cbp_luma = 0x00;
+    h->mb.i_cbp_chroma = 0x00;
+    h->mb.cbp[h->mb.i_mb_xy] = 0;
+}
+
+/*****************************************************************************
+ * x264_macroblock_encode: code the residual of the current mb, rebuild fdec,
+ *  fill non_zero_count/cbp and possibly turn a P_L0 16x16 mb into P_SKIP
+ *****************************************************************************/
+void x264_macroblock_encode( x264_t *h )
+{
+    int i_cbp_dc = 0;
+    int i_qscale;
+    int i;
+
+    if( h->mb.i_type == P_SKIP )
+    {
+        /* A bit special */
+        x264_macroblock_encode_pskip( h );
+        return;
+    }
+
+    /* quantification scale */
+    i_qscale = h->mb.qp[h->mb.i_mb_xy];
+
+    if( h->mb.i_type == I_16x16 )
+    {
+        const int i_mode = h->mb.i_intra16x16_pred_mode;
+        /* do the right prediction */
+        h->predict_16x16[i_mode]( h->mb.pic.p_fdec[0], h->mb.pic.i_stride[0] );
+
+        /* encode the 16x16 macroblock */
+        x264_mb_encode_i16x16( h, i_qscale );
+
+        /* fix the pred mode value */
+        h->mb.i_intra16x16_pred_mode = x264_mb_pred_mode16x16_fix[i_mode];
+    }
+    else if( h->mb.i_type == I_4x4 )
+    {
+        for( i = 0; i < 16; i++ )
+        {
+            const int i_dst = h->mb.pic.i_stride[0];
+            uint8_t  *p_dst = &h->mb.pic.p_fdec[0][4 * block_idx_x[i] + 4 * block_idx_y[i] * i_dst];
+            int      i_mode = h->mb.cache.intra4x4_pred_mode[x264_scan8[i]];
+
+            /* Do the right prediction */
+            h->predict_4x4[i_mode]( p_dst, i_dst );
+
+            /* encode one 4x4 block */
+            x264_mb_encode_i4x4( h, i, i_qscale );
+
+            /* fix the pred mode value */
+            h->mb.cache.intra4x4_pred_mode[x264_scan8[i]] = x264_mb_pred_mode4x4_fix[i_mode];
+        }
+    }
+    else    /* Inter MB */
+    {
+        int16_t dct4x4[16][4][4];
+        int i8x8, i4x4, idx;
+        int i_decimate_mb = 0;      /* sum of the decimate scores of the four 8x8 blocks */
+
+        /* Motion compensation */
+        x264_mb_mc( h );
+
+        h->dctf.sub16x16_dct( dct4x4,
+                              h->mb.pic.p_fenc[0], h->mb.pic.i_stride[0],
+                              h->mb.pic.p_fdec[0], h->mb.pic.i_stride[0] );
+
+        for( i8x8 = 0; i8x8 < 4; i8x8++ )
+        {
+            int i_decimate_8x8;
+
+            /* encode one 4x4 block */
+            i_decimate_8x8 = 0;
+            for( i4x4 = 0; i4x4 < 4; i4x4++ )
+            {
+                idx = i8x8 * 4 + i4x4;
+
+                quant_4x4( dct4x4[idx], i_qscale, 0 );
+                scan_zigzag_4x4full( h->dct.block[idx].luma4x4, dct4x4[idx] );
+                x264_mb_dequant_4x4( dct4x4[idx], i_qscale );
+
+                i_decimate_8x8 += x264_mb_decimate_score( h->dct.block[idx].luma4x4, 16 );
+            }
+
+            /* decimate this 8x8 block (its score still counts for the whole mb) */
+            i_decimate_mb += i_decimate_8x8;
+            if( i_decimate_8x8 < 4 )
+            {
+                for( i4x4 = 0; i4x4 < 4; i4x4++ )
+                {
+                    int x, y;
+                    idx = i8x8 * 4 + i4x4;
+                    for( i = 0; i < 16; i++ )
+                    {
+                        h->dct.block[idx].luma4x4[i] = 0;
+                    }
+                    for( x = 0; x < 4; x++ )
+                    {
+                        for( y = 0; y < 4; y++ )
+                        {
+                            dct4x4[idx][x][y] = 0;
+                        }
+                    }
+                }
+            }
+        }
+
+        if( i_decimate_mb < 6 )     /* whole mb: drop every luma level, nothing is added to fdec */
+        {
+            for( idx = 0; idx < 16; idx++ )
+            {
+                for( i = 0; i < 16; i++ )
+                {
+                    h->dct.block[idx].luma4x4[i] = 0;
+                }
+            }
+        }
+        else
+        {
+            h->dctf.add16x16_idct( h->mb.pic.p_fdec[0], h->mb.pic.i_stride[0], dct4x4 );
+        }
+    }
+
+    /* encode chroma */
+    i_qscale = i_chroma_qp_table[x264_clip3( i_qscale + h->pps->i_chroma_qp_index_offset, 0, 51 )];
+    if( IS_INTRA( h->mb.i_type ) )
+    {
+        const int i_mode = h->mb.i_chroma_pred_mode;
+        /* do the right prediction */
+        h->predict_8x8[i_mode]( h->mb.pic.p_fdec[1], h->mb.pic.i_stride[1] );
+        h->predict_8x8[i_mode]( h->mb.pic.p_fdec[2], h->mb.pic.i_stride[2] );
+
+        /* fix the pred mode value */
+        h->mb.i_chroma_pred_mode = x264_mb_pred_mode8x8_fix[i_mode];
+    }
+
+    /* encode the 8x8 blocks */
+    x264_mb_encode_8x8( h, !IS_INTRA( h->mb.i_type ), i_qscale );
+
+    /* Calculate the Luma/Chroma pattern and non_zero_count */
+    if( h->mb.i_type == I_16x16 )
+    {
+        h->mb.i_cbp_luma = 0x00;
+        for( i = 0; i < 16; i++ )
+        {
+            const int nz = array_non_zero_count( h->dct.block[i].residual_ac, 15 );
+            h->mb.cache.non_zero_count[x264_scan8[i]] = nz;
+            if( nz > 0 )
+            {
+                h->mb.i_cbp_luma = 0x0f;    /* all or nothing for I_16x16 */
+            }
+        }
+    }
+    else
+    {
+        h->mb.i_cbp_luma = 0x00;
+        for( i = 0; i < 16; i++ )
+        {
+            const int nz = array_non_zero_count( h->dct.block[i].luma4x4, 16 );
+            h->mb.cache.non_zero_count[x264_scan8[i]] = nz;
+            if( nz > 0 )
+            {
+                h->mb.i_cbp_luma |= 1 << (i/4);    /* one bit per group of four blocks */
+            }
+        }
+    }
+
+    /* Calculate the chroma pattern */
+    h->mb.i_cbp_chroma = 0x00;
+    for( i = 0; i < 8; i++ )
+    {
+        const int nz = array_non_zero_count( h->dct.block[16+i].residual_ac, 15 );
+        h->mb.cache.non_zero_count[x264_scan8[16+i]] = nz;
+        if( nz > 0 )
+        {
+            h->mb.i_cbp_chroma = 0x02;    /* dc+ac (we can't do only ac) */
+        }
+    }
+    if( h->mb.i_cbp_chroma == 0x00 &&
+        ( array_non_zero_count( h->dct.chroma_dc[0], 4 ) > 0 || array_non_zero_count( h->dct.chroma_dc[1], 4 ) > 0 ) )
+    {
+        h->mb.i_cbp_chroma = 0x01;    /* dc only */
+    }
+
+    if( h->param.b_cabac )
+    {
+        if( h->mb.i_type == I_16x16 && array_non_zero_count( h->dct.luma16x16_dc, 16 ) > 0 )
+            i_cbp_dc = 0x01;
+        else
+            i_cbp_dc = 0x00;
+
+        if( array_non_zero_count( h->dct.chroma_dc[0], 4 ) > 0 )
+            i_cbp_dc |= 0x02;
+        if( array_non_zero_count( h->dct.chroma_dc[1], 4 ) > 0 )
+            i_cbp_dc |= 0x04;
+    }
+
+    /* store cbp: dc flags from bit 8, chroma from bit 4, luma in the low bits */
+    h->mb.cbp[h->mb.i_mb_xy] = (i_cbp_dc << 8) | (h->mb.i_cbp_chroma << 4) | h->mb.i_cbp_luma;
+
+    /* Check for P_SKIP
+     * XXX: in the me perhaps we should take x264_mb_predict_mv_pskip into account
+     *      (if multiple mv give same result)*/
+    if( h->mb.i_type == P_L0 && h->mb.i_partition == D_16x16 &&
+        h->mb.i_cbp_luma == 0x00 && h->mb.i_cbp_chroma== 0x00 &&
+        h->mb.qp[h->mb.i_mb_xy] == h->mb.i_last_qp )
+    {
+        if( h->mb.cache.ref[0][x264_scan8[0]] == 0 )
+        {
+            int mvp[2];
+
+            x264_mb_predict_mv_pskip( h, mvp );
+            if( h->mb.cache.mv[0][x264_scan8[0]][0] == mvp[0] &&
+                h->mb.cache.mv[0][x264_scan8[0]][1] == mvp[1] )
+            {
+                h->mb.type[h->mb.i_mb_xy] = h->mb.i_type = P_SKIP;
+            }
+        }
+    }
+}
+
+/*****************************************************************************
+ * x264_macroblock_probe_pskip:
+ *  Check if the current MB could be encoded as a P_SKIP (it supposes you use
+ *  the previous QP). Returns 1 if it could, 0 if not; fdec is overwritten.
+ *****************************************************************************/
+int x264_macroblock_probe_pskip( x264_t *h )
+{
+    DECLARE_ALIGNED( int16_t, dct4x4[16][4][4], 16 );
+    DECLARE_ALIGNED( int16_t, dct2x2[2][2], 16 );
+    DECLARE_ALIGNED( int,     dctscan[16], 16 );    /* one scanned 4x4 block at a time */
+
+    int i_qp;
+    int mvp[2];
+    int ch;
+    int n;
+
+    int i8x8, i4x4;
+    int i_decimate_mb;
+
+    /* quantification scale */
+    i_qp = h->mb.qp[h->mb.i_mb_xy];
+
+    /* Get the MV */
+    x264_mb_predict_mv_pskip( h, mvp );
+
+    /* Special case, need to clip the vector */
+    n = 16 * h->mb.i_mb_x + mvp[0];
+    if( n < -24 )
+        mvp[0] = -24 - 16*h->mb.i_mb_x;
+    else if( n > 16 * h->sps->i_mb_width + 24 )
+        mvp[0] = 16 * ( h->sps->i_mb_width - h->mb.i_mb_x ) + 24;
+
+    n = 16 * h->mb.i_mb_y + mvp[1];
+    if( n < -24 )
+        mvp[1] = -24 - 16*h->mb.i_mb_y;
+    else if( n > 16 * h->sps->i_mb_height + 8 )
+        mvp[1] = 16 * ( h->sps->i_mb_height - h->mb.i_mb_y ) + 8;
+
+    /* NOTE(review): n adds mvp to 16 * mb index; confirm both use the same unit */
+    /* Motion compensation */
+    h->mc[MC_LUMA]( h->mb.pic.p_fref[0][0][0], h->mb.pic.i_stride[0],
+                    h->mb.pic.p_fdec[0],       h->mb.pic.i_stride[0],
+                    mvp[0], mvp[1], 16, 16 );
+
+    /* get luma diff */
+    h->dctf.sub16x16_dct( dct4x4, h->mb.pic.p_fenc[0], h->mb.pic.i_stride[0],
+                                  h->mb.pic.p_fdec[0], h->mb.pic.i_stride[0] );
+
+    for( i8x8 = 0, i_decimate_mb = 0; i8x8 < 4; i8x8++ )
+    {
+        /* encode one 4x4 block */
+        for( i4x4 = 0; i4x4 < 4; i4x4++ )
+        {
+            const int idx = i8x8 * 4 + i4x4;
+
+            quant_4x4( dct4x4[idx], i_qp, 0 );
+            scan_zigzag_4x4full( dctscan, dct4x4[idx] );
+
+            i_decimate_mb += x264_mb_decimate_score( dctscan, 16 );
+
+            if( i_decimate_mb >= 6 )    /* the score is accumulated over the whole mb */
+            {
+                /* not as P_SKIP */
+                return 0;
+            }
+        }
+    }
+
+    /* encode chroma */
+    i_qp = i_chroma_qp_table[x264_clip3( i_qp + h->pps->i_chroma_qp_index_offset, 0, 51 )];
+
+    for( ch = 0; ch < 2; ch++ )
+    {
+        const int i_stride = h->mb.pic.i_stride[1+ch];
+        uint8_t  *p_src = h->mb.pic.p_fenc[1+ch];
+        uint8_t  *p_dst = h->mb.pic.p_fdec[1+ch];
+
+        h->mc[MC_CHROMA]( h->mb.pic.p_fref[0][0][1+ch], i_stride,
+                          h->mb.pic.p_fdec[1+ch],       i_stride,
+                          mvp[0], mvp[1], 8, 8 );
+
+        h->dctf.sub8x8_dct( dct4x4, p_src, i_stride, p_dst, i_stride );
+
+        /* calculate dct DC */
+        dct2x2[0][0] = dct4x4[0][0][0];
+        dct2x2[0][1] = dct4x4[1][0][0];
+        dct2x2[1][0] = dct4x4[2][0][0];
+        dct2x2[1][1] = dct4x4[3][0][0];
+        h->dctf.dct2x2dc( dct2x2 );
+        quant_2x2_dc( dct2x2, i_qp, 0 );
+        if( dct2x2[0][0] || dct2x2[0][1] || dct2x2[1][0] || dct2x2[1][1]  )
+        {
+            /* a non-zero chroma DC level remains: can't be */
+            return 0;
+        }
+
+        /* calculate dct coeffs; the score restarts at 0 for each chroma plane */
+        for( i4x4 = 0, i_decimate_mb = 0; i4x4 < 4; i4x4++ )
+        {
+            quant_4x4( dct4x4[i4x4], i_qp, 0 );
+            scan_zigzag_4x4( dctscan, dct4x4[i4x4] );
+
+            i_decimate_mb += x264_mb_decimate_score( dctscan, 15 );
+            if( i_decimate_mb >= 7 )
+            {
+                return 0;
+            }
+        }
+    }
+
+    return 1;
+}
diff --git a/encoder/macroblock.h b/encoder/macroblock.h
new file mode 100644
index 00000000..b030755c
--- /dev/null
+++ b/encoder/macroblock.h
@@ -0,0 +1,38 @@
+/*****************************************************************************
+ * macroblock.h: h264 encoder library
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: macroblock.h,v 1.1 2004/06/03 19:27:08 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#ifndef _ENCODER_MACROBLOCK_H
+#define _ENCODER_MACROBLOCK_H 1
+
+#include "../core/macroblock.h"
+/* returns 1 when the current mb could be coded as P_SKIP, 0 otherwise */
+int x264_macroblock_probe_pskip( x264_t *h );
+/* encode the current mb of h (residual, reconstruction into fdec, cbp) */
+void x264_macroblock_encode      ( x264_t *h );
+void x264_macroblock_write_cabac ( x264_t *h, bs_t *s );
+void x264_macroblock_write_cavlc ( x264_t *h, bs_t *s );
+/* NOTE(review): the writers above and the function below are defined elsewhere - semantics to confirm */
+void x264_cabac_mb_skip( x264_t *h, int b_skip );
+
+#endif
+
diff --git a/encoder/me.c b/encoder/me.c
new file mode 100644
index 00000000..b1653754
--- /dev/null
+++ b/encoder/me.c
@@ -0,0 +1,194 @@
+/*****************************************************************************
+ * me.c: h264 encoder library (Motion Estimation)
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: me.c,v 1.1 2004/06/03 19:27:08 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdint.h>
+
+#include "../core/common.h"
+#include "me.h"
+
+void x264_me_search( x264_t *h, x264_me_t *m )
+{
+    /* Full-pel diamond search started from the rounded mvp (and mvc when b_mvc is set).
+     * Outputs m->mv (full-pel position << 2) and m->cost (satd + lm * vector bits). */
+    const int i_pixel = m->i_pixel;
+    int bcost;                      /* best cost so far */
+    int bmx, bmy;                   /* best full-pel position so far */
+    uint8_t *p_fref = m->p_fref;    /* re-pointed below at the best position */
+    int i_iter;
+
+    /* init with mvp */
+    /* XXX: We don't need to clamp because of the way the diamond works: we will
+     * never go outside the padded picture, and predict mv won't compute a vector
+     * with a greater component magnitude.
+     * XXX: if some vector can go outside, (accelerator, ....) you need to clip
+     * them yourself */
+    bmx = x264_clip3( ( m->mvp[0] + 2 ) >> 2, -m->i_mv_range, m->i_mv_range );
+    bmy = x264_clip3( ( m->mvp[1] + 2 ) >> 2, -m->i_mv_range, m->i_mv_range );
+
+    p_fref = &m->p_fref[bmy * m->i_stride + bmx];
+    bcost = h->pixf.sad[i_pixel]( m->p_fenc, m->i_stride, p_fref, m->i_stride );  /* NOTE(review): no lm term here, unlike the costs below */
+
+    /* try a candidate if provided */
+    if( m->b_mvc )
+    {
+        const int mx = x264_clip3( ( m->mvc[0] + 2 ) >> 2, -m->i_mv_range, m->i_mv_range );
+        const int my = x264_clip3( ( m->mvc[1] + 2 ) >> 2, -m->i_mv_range, m->i_mv_range );
+        uint8_t *p_fref2 = &m->p_fref[my*m->i_stride+mx];
+        int cost = h->pixf.sad[i_pixel]( m->p_fenc, m->i_stride, p_fref2, m->i_stride ) +
+                   m->lm * ( bs_size_se( m->mvc[0] - m->mvp[0] ) + bs_size_se( m->mvc[1] - m->mvp[1] ) );
+        if( cost < bcost )
+        {
+            bmx = mx;
+            bmy = my;
+            bcost = cost;
+            p_fref = p_fref2;
+        }
+    }
+
+    /* Don't need to test mv_range each time, we won't go outside picture+padding */
+    /* diamond: at most 16 steps of one full pel each */
+    for( i_iter = 0; i_iter < 16; i_iter++ )
+    {
+        int best = 0;
+        int cost[4];
+
+#define COST_MV( c, dx, dy ) \
+        (c) = h->pixf.sad[i_pixel]( m->p_fenc, m->i_stride,                    \
+                               &p_fref[(dy)*m->i_stride+(dx)], m->i_stride ) + \
+              m->lm * ( bs_size_se(((bmx+(dx))<<2) - m->mvp[0] ) +         \
+                        bs_size_se(((bmy+(dy))<<2) - m->mvp[1] ) )
+
+        COST_MV( cost[0],  0, -1 );
+        COST_MV( cost[1],  0,  1 );
+        COST_MV( cost[2], -1,  0 );
+        COST_MV( cost[3],  1,  0 );
+#undef COST_MV
+
+        if( cost[1] < cost[0] )    best = 1;
+        if( cost[2] < cost[best] ) best = 2;
+        if( cost[3] < cost[best] ) best = 3;
+
+        if( bcost <= cost[best] )   /* no neighbour is strictly better: stop */
+            break;
+
+        bcost = cost[best];
+
+        if( best == 0 ) {
+            bmy--;
+            p_fref -= m->i_stride;
+        } else if( best == 1 ) {
+            bmy++;
+            p_fref += m->i_stride;
+        } else if( best == 2 ) {
+            bmx--;
+            p_fref--;
+        } else if( best == 3 ) {
+            bmx++;
+            p_fref++;
+        }
+    }
+
+    /* -> qpel mv */
+    m->mv[0] = bmx << 2;
+    m->mv[1] = bmy << 2;
+
+    /* compute the real cost (satd instead of the sad used during the search) */
+    m->cost = h->pixf.satd[i_pixel]( m->p_fenc, m->i_stride, p_fref, m->i_stride ) +
+                m->lm * ( bs_size_se( m->mv[0] - m->mvp[0] ) +
+                          bs_size_se( m->mv[1] - m->mvp[1] ) );
+}
+
+void x264_me_refine_qpel( x264_t *h, x264_me_t *m )
+{
+    /* Refine m->mv around its current value: the four neighbours at a
+     * distance of 2 (in m->mv units) are tried first, then the four at a
+     * distance of 1 around the position kept so far. A neighbour is
+     * adopted only when its cost (satd + lm * vector bits) is strictly
+     * lower than m->cost; m->mv and m->cost are updated in place. */
+
+    /* (dx, dy) directions, in the order they are evaluated */
+    static const int dir[4][2] =
+    {
+        {  0, -1 },
+        {  0,  1 },
+        { -1,  0 },
+        {  1,  0 }
+    };
+
+    const int bw = x264_pixel_size[m->i_pixel].w;
+    const int bh = x264_pixel_size[m->i_pixel].h;
+
+    DECLARE_ALIGNED( uint8_t, pix[4][16*16], 16 );
+
+    int bmx = m->mv[0];
+    int bmy = m->mv[1];
+    int i_step;
+
+    for( i_step = 2; i_step >= 1; i_step-- )
+    {
+        int cost[4];
+        int best = 0;
+        int i;
+
+        /* build the four predictions around (bmx, bmy) */
+        for( i = 0; i < 4; i++ )
+        {
+            h->mc[MC_LUMA]( m->p_fref, m->i_stride, pix[i], 16,
+                            bmx + i_step * dir[i][0], bmy + i_step * dir[i][1],
+                            bw, bh );
+        }
+
+        /* distortion plus the weighted size of the vector difference */
+        for( i = 0; i < 4; i++ )
+        {
+            const int mx = bmx + i_step * dir[i][0];
+            const int my = bmy + i_step * dir[i][1];
+
+            cost[i] = h->pixf.satd[m->i_pixel]( m->p_fenc, m->i_stride, pix[i], 16 ) +
+                      m->lm * ( bs_size_se( mx - m->mvp[0] ) + bs_size_se( my - m->mvp[1] ) );
+        }
+
+        /* lowest cost wins, the earliest direction on ties */
+        for( i = 1; i < 4; i++ )
+        {
+            if( cost[i] < cost[best] )
+            {
+                best = i;
+            }
+        }
+
+        /* move only on a strict improvement */
+        if( cost[best] < m->cost )
+        {
+            m->cost = cost[best];
+            bmx += i_step * dir[best][0];
+            bmy += i_step * dir[best][1];
+        }
+    }
+
+    m->mv[0] = bmx;
+    m->mv[1] = bmy;
+}
diff --git a/encoder/me.h b/encoder/me.h
new file mode 100644
index 00000000..bc639a3e
--- /dev/null
+++ b/encoder/me.h
@@ -0,0 +1,52 @@
+/*****************************************************************************
+ * me.h: h264 encoder library (Motion Estimation)
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: me.h,v 1.1 2004/06/03 19:27:08 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#ifndef _ME_H
+#define _ME_H 1
+
+typedef struct
+{
+    /* input */
+    int      i_pixel;   /* PIXEL_WxH */
+    int      lm;        /* lambda motion */
+
+    uint8_t *p_fref;    /* reference plane at the block position (mv 0,0) */
+    uint8_t *p_fenc;    /* block to match */
+    int      i_stride;  /* stride used for both p_fref and p_fenc */
+
+    int i_mv_range;     /* full-pel bound applied to the starting vectors */
+
+    int mvp[2];         /* predicted mv: search start and origin of the vector cost */
+
+    int b_mvc;          /* non zero when mvc holds an extra candidate */
+    int mvc[2];         /* candidate mv, same unit as mvp */
+
+    /* output */
+    int cost;           /* satd + lm * nbits */
+    int mv[2];          /* x264_me_search stores the full-pel result << 2 */
+} x264_me_t;
+
+void x264_me_search( x264_t *h, x264_me_t *m );         /* full-pel search, fills mv and cost */
+void x264_me_refine_qpel( x264_t *h, x264_me_t *m );    /* refines mv and cost in place */
+
+#endif
diff --git a/encoder/ratecontrol.c b/encoder/ratecontrol.c
new file mode 100644
index 00000000..56a5af9b
--- /dev/null
+++ b/encoder/ratecontrol.c
@@ -0,0 +1,96 @@
+/*****************************************************************************
+ * ratecontrol.c: h264 encoder library (Rate Control)
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: ratecontrol.c,v 1.1 2004/06/03 19:27:08 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdint.h>
+
+#include "../core/common.h"
+#include "ratecontrol.h"
+
+
+x264_ratecontrol_t *x264_ratecontrol_new( x264_param_t *param )
+{
+    x264_ratecontrol_t *rc = x264_malloc( sizeof( *rc ) );
+    /* New context initialised from param; NULL is returned if the allocation fails */
+    if( rc == NULL )
+        return NULL;
+    rc->fps = param->f_fps > 0.1 ? param->f_fps : 25.0f;   /* fall back to 25 fps */
+    rc->i_iframe = param->i_iframe;
+    rc->i_bitrate = param->i_bitrate * 1000;
+    rc->i_qp_last = 26;
+    rc->i_qp      = param->i_qp_constant;
+    rc->i_frames  = 0;
+    rc->i_size    = 0;
+    /* i_slice_type is left for x264_ratecontrol_start to set */
+    return rc;
+}
+
+void x264_ratecontrol_delete( x264_ratecontrol_t *rc )
+{
+    x264_free( rc );    /* releases the context returned by x264_ratecontrol_new */
+}
+
+void x264_ratecontrol_start( x264_ratecontrol_t *rc, int i_slice_type )
+{
+    rc->i_slice_type = i_slice_type;    /* only records the slice type in the context */
+}
+
+int  x264_ratecontrol_qp( x264_ratecontrol_t *rc )
+{
+    return x264_clip3( rc->i_qp, 1, 51 );   /* current QP, kept within 1..51 */
+}
+
+void x264_ratecontrol_end( x264_ratecontrol_t *rc, int bits )
+{
+    return;     /* currently a no-op: everything below is compiled out by #if 0 */
+#if 0
+    int i_avg;
+    int i_target = rc->i_bitrate / rc->fps;
+    int i_qp = rc->i_qp;
+    /* disabled draft: compare the average frame size with the per-frame target */
+    rc->i_qp_last = rc->i_qp;
+    rc->i_frames++;
+    rc->i_size += bits / 8;
+
+    i_avg = 8 * rc->i_size / rc->i_frames;
+
+    if( rc->i_slice_type == SLICE_TYPE_I )
+    {
+        i_target = i_target * 20 / 10;
+    }
+    /* NOTE(review): i_target and i_avg are used as divisors without a zero check */
+    if( i_avg > i_target * 11 / 10 )
+    {
+        i_qp = rc->i_qp + ( i_avg / i_target - 1 );
+    }
+    else if( i_avg < i_target * 9 / 10 )
+    {
+        i_qp = rc->i_qp - ( i_target / i_avg - 1 );
+    }
+    /* the QP may move by at most 2 from the previous value */
+    rc->i_qp = x264_clip3( i_qp, rc->i_qp_last - 2, rc->i_qp_last + 2 );
+#endif
+}
+
diff --git a/encoder/ratecontrol.h b/encoder/ratecontrol.h
new file mode 100644
index 00000000..5fa3c7c9
--- /dev/null
+++ b/encoder/ratecontrol.h
@@ -0,0 +1,52 @@
+/*****************************************************************************
+ * ratecontrol.h: h264 encoder library (Rate Control)
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: ratecontrol.h,v 1.1 2004/06/03 19:27:08 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#ifndef _RATECONTROL_H
+#define _RATECONTROL_H 1
+/* Rate-control state; initialised by x264_ratecontrol_new(). */
+struct x264_ratecontrol_t
+{
+    float fps;              /* param->f_fps, or 25 when that value is not above 0.1 */
+    int   i_iframe;         /* copy of param->i_iframe */
+
+    int i_bitrate;          /* param->i_bitrate * 1000 */
+    int i_qp_last;          /* previous QP value; starts at 26 */
+    int i_qp;               /* QP returned (clipped) by x264_ratecontrol_qp(); starts at param->i_qp_constant */
+
+    int i_slice_type;       /* set by x264_ratecontrol_start() */
+
+    int     i_frames;       /* starts at 0; only the disabled model in x264_ratecontrol_end() counts it up */
+    int64_t i_size;         /* starts at 0; the disabled model accumulates bits / 8 here */
+
+};
+
+/* Lifetime: _new() allocates a context seeded from the parameters, _delete() frees it. */
+x264_ratecontrol_t *x264_ratecontrol_new   ( x264_param_t * );
+void                x264_ratecontrol_delete( x264_ratecontrol_t * );
+/* Hooks: _start() records the slice type, _qp() returns the clipped QP, _end() receives a bit count. */
+void x264_ratecontrol_start( x264_ratecontrol_t *, int i_slice_type );
+int  x264_ratecontrol_qp( x264_ratecontrol_t * );
+void x264_ratecontrol_end( x264_ratecontrol_t *, int bits );
+
+#endif
+
diff --git a/encoder/set.c b/encoder/set.c
new file mode 100644
index 00000000..56284642
--- /dev/null
+++ b/encoder/set.c
@@ -0,0 +1,382 @@
+/*****************************************************************************
+ * set: h264 encoder (SPS and PPS init and write)
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: set.c,v 1.1 2004/06/03 19:27:08 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdint.h>
+
+#include "../x264.h"
+#include "../core/bs.h"
+#include "../core/set.h"
+/* Initialise *sps (id i_id) from param: profile, frame_num/POC widths, size in macroblocks, cropping, SAR. */
+void x264_sps_init( x264_sps_t *sps, int i_id, x264_param_t *param )
+{
+    sps->i_id               = i_id;
+
+    if( param->b_cabac || param->i_bframe > 0 )
+        sps->i_profile_idc      = PROFILE_MAIN;
+    else
+        sps->i_profile_idc      = PROFILE_BASELINE;
+
+    sps->i_level_idc        = 21;               /* FIXME ? */
+    sps->b_constraint_set0  = 0;
+    sps->b_constraint_set1  = 0;
+    sps->b_constraint_set2  = 0;
+    /* Smallest width (>= 4 bits) whose range exceeds i_idrframe * i_iframe, plus one spare bit. */
+    sps->i_log2_max_frame_num = 4;  /* at least 4 */
+    while( (1 << sps->i_log2_max_frame_num) <= param->i_idrframe * param->i_iframe )
+    {
+        sps->i_log2_max_frame_num++;
+    }
+    sps->i_log2_max_frame_num++;    /* just in case */
+
+    sps->i_poc_type = 0;
+    if( sps->i_poc_type == 0 )
+    {
+        sps->i_log2_max_poc_lsb = sps->i_log2_max_frame_num + 1;    /* max poc = 2*frame_num */
+    }
+    else if( sps->i_poc_type == 1 )
+    {
+        int i;
+
+        /* FIXME */
+        sps->b_delta_pic_order_always_zero = 1;
+        sps->i_offset_for_non_ref_pic = 0;
+        sps->i_offset_for_top_to_bottom_field = 0;
+        sps->i_num_ref_frames_in_poc_cycle = 0;
+
+        for( i = 0; i < sps->i_num_ref_frames_in_poc_cycle; i++ )
+        {
+            sps->i_offset_for_ref_frame[i] = 0;
+        }
+    }
+
+    sps->i_num_ref_frames = param->i_frame_reference + 1; /* +1 for 2 ref in B */
+    sps->b_gaps_in_frame_num_value_allowed = 0;
+    sps->i_mb_width = ( param->i_width + 15 ) / 16;
+    sps->i_mb_height= ( param->i_height + 15 )/ 16;
+    sps->b_frame_mbs_only = 1;
+    sps->b_mb_adaptive_frame_field = 0;
+    sps->b_direct8x8_inference = 0;
+    if( sps->b_frame_mbs_only == 0 )
+    {
+        sps->b_direct8x8_inference = 1;
+    }
+    /* Crop away the padding up to whole macroblocks; offsets are in 2-pixel units, 0 for an aligned side. */
+    if( param->i_width % 16 != 0 || param->i_height % 16 != 0 )
+    {
+        sps->b_crop = 1;
+        sps->crop.i_left    = 0;
+        sps->crop.i_right   = ( 16 * sps->i_mb_width - param->i_width ) / 2;
+        sps->crop.i_top     = 0;
+        sps->crop.i_bottom  = ( 16 * sps->i_mb_height - param->i_height ) / 2;
+    }
+    else
+    {
+        sps->b_crop = 0;
+        sps->crop.i_left    = 0;
+        sps->crop.i_right   = 0;
+        sps->crop.i_top     = 0;
+        sps->crop.i_bottom  = 0;
+    }
+    /* Reduce the requested SAR by its GCD, then halve both terms until each fits in 16 bits. */
+    if( param->vui.i_sar_width > 0 && param->vui.i_sar_height > 0 )
+    {
+        int w = param->vui.i_sar_width;
+        int h = param->vui.i_sar_height;
+        int a = w, b = h;
+
+        while( b != 0 )
+        {
+            int t = a;
+
+            a = b;
+            b = t % b;
+        }
+
+        w /= a;
+        h /= a;
+        while( w > 65535 || h > 65535 )
+        {
+            w /= 2;
+            h /= 2;
+        }
+
+        if( w == 0 || h == 0 )
+        {
+            fprintf( stderr, "x264: cannot create valid sample aspect ratio\n" );
+            sps->b_vui = 0;
+        }
+        else if( w == h )
+        {
+            fprintf( stderr, "x264: no need for a SAR\n" );
+            sps->b_vui = 0;
+        }
+        else
+        {
+            fprintf( stderr, "x264: using SAR=%d/%d\n", w, h );
+            sps->b_vui = 1;
+            sps->vui.i_sar_width = w;
+            sps->vui.i_sar_height= h;
+        }
+    }
+    else
+    {
+        sps->b_vui = 0;
+    }
+}
+
+/* Write the fields of *sps to bitstream s in order, finishing with bs_rbsp_trailing(). */
+void x264_sps_write( bs_t *s, x264_sps_t *sps )
+{
+    bs_write( s, 8, sps->i_profile_idc );
+    bs_write( s, 1, sps->b_constraint_set0 );
+    bs_write( s, 1, sps->b_constraint_set1 );
+    bs_write( s, 1, sps->b_constraint_set2 );
+
+    bs_write( s, 5, 0 );    /* reserved */
+
+    bs_write( s, 8, sps->i_level_idc );
+
+    bs_write_ue( s, sps->i_id );
+    bs_write_ue( s, sps->i_log2_max_frame_num - 4 );
+    bs_write_ue( s, sps->i_poc_type );
+    if( sps->i_poc_type == 0 )
+    {
+        bs_write_ue( s, sps->i_log2_max_poc_lsb - 4 );
+    }
+    else if( sps->i_poc_type == 1 )
+    {
+        int i;
+
+        bs_write( s, 1, sps->b_delta_pic_order_always_zero );
+        bs_write_se( s, sps->i_offset_for_non_ref_pic );
+        bs_write_se( s, sps->i_offset_for_top_to_bottom_field );
+        bs_write_ue( s, sps->i_num_ref_frames_in_poc_cycle );
+
+        for( i = 0; i < sps->i_num_ref_frames_in_poc_cycle; i++ )
+        {
+            bs_write_se( s, sps->i_offset_for_ref_frame[i] );
+        }
+    }
+    bs_write_ue( s, sps->i_num_ref_frames );
+    bs_write( s, 1, sps->b_gaps_in_frame_num_value_allowed );
+    bs_write_ue( s, sps->i_mb_width - 1 );
+    bs_write_ue( s, sps->i_mb_height - 1);
+    bs_write( s, 1, sps->b_frame_mbs_only );
+    if( !sps->b_frame_mbs_only )
+    {
+        bs_write( s, 1, sps->b_mb_adaptive_frame_field );
+    }
+    bs_write( s, 1, sps->b_direct8x8_inference );
+
+    bs_write( s, 1, sps->b_crop );
+    if( sps->b_crop )
+    {
+        bs_write_ue( s, sps->crop.i_left );
+        bs_write_ue( s, sps->crop.i_right );
+        bs_write_ue( s, sps->crop.i_top );
+        bs_write_ue( s, sps->crop.i_bottom );
+    }
+
+    bs_write( s, 1, sps->b_vui );
+    if( sps->b_vui )
+    {
+        int i;
+        static const struct { int w, h; int sar; } sar[] =  /* known w/h ratios -> 8-bit code; sar == -1 ends the list */
+        {
+            { 1,   1, 1 }, { 12, 11, 2 }, { 10, 11, 3 }, { 16, 11, 4 },
+            { 40, 33, 5 }, { 24, 11, 6 }, { 20, 11, 7 }, { 32, 11, 8 },
+            { 80, 33, 9 }, { 18, 11, 10}, { 15, 11, 11}, { 64, 33, 12},
+            { 160,99, 13}, { 0, 0, -1 }
+        };
+        bs_write1( s, 1 );      /* aspect_ratio_info_present_flag */
+        for( i = 0; sar[i].sar != -1; i++ )
+        {
+            if( sar[i].w == sps->vui.i_sar_width && sar[i].h == sps->vui.i_sar_height )
+                break;
+        }
+        if( sar[i].sar != -1 )
+        {
+            bs_write( s, 8, sar[i].sar );
+        }
+        else
+        {
+            bs_write( s, 8, 255);   /* aspect_ratio_idc (extended): explicit 16-bit width and height follow */
+            bs_write( s, 16, sps->vui.i_sar_width );
+            bs_write( s, 16, sps->vui.i_sar_height );
+        }
+
+        bs_write1( s, 0 );      /* overscan_info_present_flag */
+
+        bs_write1( s, 0 );      /* video_signal_type_present_flag */
+#if 0
+        bs_write( s, 3, 5 );    /* unspecified video format */
+        bs_write1( s, 1 );      /* video full range flag */
+        bs_write1( s, 0 );      /* colour description present flag */
+#endif
+        bs_write1( s, 0 );      /* chroma_loc_info_present_flag */
+        bs_write1( s, 0 );      /* timing_info_present_flag */
+        bs_write1( s, 0 );      /* nal_hrd_parameters_present_flag */
+        bs_write1( s, 0 );      /* vcl_hrd_parameters_present_flag */
+        bs_write1( s, 0 );      /* pic_struct_present_flag */
+        bs_write1( s, 0 );      /* bitstream_restriction_flag */
+    }
+
+    bs_rbsp_trailing( s );
+}
+/* Set up *pps (id i_id) for *sps: a single slice group, no weighted prediction, CABAC as requested. */
+void x264_pps_init( x264_pps_t *pps, int i_id, x264_param_t *param, x264_sps_t *sps )
+{
+    int i_group;
+
+    pps->i_id     = i_id;
+    pps->i_sps_id = sps->i_id;
+    pps->b_cabac  = param->b_cabac;
+
+    pps->b_pic_order        = 0;
+    pps->i_num_slice_groups = 1;
+
+    /* Slice-group map defaults; skipped while a single slice group is configured above. */
+    if( pps->i_num_slice_groups > 1 )
+    {
+        pps->i_slice_group_map_type = 0;
+        switch( pps->i_slice_group_map_type )
+        {
+            case 0:
+                for( i_group = 0; i_group < pps->i_num_slice_groups; i_group++ )
+                {
+                    pps->i_run_length[i_group] = 1;
+                }
+                break;
+
+            case 2:
+                for( i_group = 0; i_group < pps->i_num_slice_groups; i_group++ )
+                {
+                    pps->i_top_left[i_group] = 0;
+                    pps->i_bottom_right[i_group] = 0;
+                }
+                break;
+
+            case 3:
+            case 4:
+            case 5:
+                pps->b_slice_group_change_direction = 0;
+                pps->i_slice_group_change_rate = 0;
+                break;
+
+            case 6:
+                pps->i_pic_size_in_map_units = 1;
+                for( i_group = 0; i_group < pps->i_pic_size_in_map_units; i_group++ )
+                {
+                    pps->i_slice_group_id[i_group] = 0;
+                }
+                break;
+
+            default:
+                break;
+        }
+    }
+
+    pps->i_num_ref_idx_l0_active = 1;
+    pps->i_num_ref_idx_l1_active = 1;
+
+    pps->b_weighted_pred   = 0;
+    pps->b_weighted_bipred = 0;
+
+    pps->i_pic_init_qp = 26;
+    pps->i_pic_init_qs = 26;
+
+    pps->i_chroma_qp_index_offset = 0;
+
+    /* Deblocking filter control is always enabled here, independent of param. */
+    pps->b_deblocking_filter_control = 1;
+    pps->b_constrained_intra_pred    = 0;
+    pps->b_redundant_pic_cnt         = 0;
+}
+/* Write the fields of *pps to bitstream s in order, finishing with bs_rbsp_trailing(). */
+void x264_pps_write( bs_t *s, x264_pps_t *pps )
+{
+    bs_write_ue( s, pps->i_id );
+    bs_write_ue( s, pps->i_sps_id );
+
+    bs_write( s, 1, pps->b_cabac );
+    bs_write( s, 1, pps->b_pic_order );
+    bs_write_ue( s, pps->i_num_slice_groups - 1 );
+
+    if( pps->i_num_slice_groups > 1 )
+    {
+        int i;
+
+        bs_write_ue( s, pps->i_slice_group_map_type );
+        if( pps->i_slice_group_map_type == 0 )
+        {
+            for( i = 0; i < pps->i_num_slice_groups; i++ )
+            {
+                bs_write_ue( s, pps->i_run_length[i] - 1 );
+            }
+        }
+        else if( pps->i_slice_group_map_type == 2 )
+        {
+            for( i = 0; i < pps->i_num_slice_groups; i++ )
+            {
+                bs_write_ue( s, pps->i_top_left[i] );
+                bs_write_ue( s, pps->i_bottom_right[i] );
+            }
+        }
+        else if( pps->i_slice_group_map_type >= 3 &&
+                 pps->i_slice_group_map_type <= 5 )
+        {
+            bs_write( s, 1, pps->b_slice_group_change_direction );
+            bs_write_ue( s, pps->i_slice_group_change_rate - 1 );   /* the rate, not the direction flag */
+        }
+        else if( pps->i_slice_group_map_type == 6 )
+        {
+            bs_write_ue( s, pps->i_pic_size_in_map_units - 1 );
+            for( i = 0; i < pps->i_pic_size_in_map_units; i++ )
+            {
+                /* FIXME */
+                /* bs_write( s, ceil( log2( pps->i_pic_size_in_map_units +1 ) ),
+                 *              pps->i_slice_group_id[i] );
+                 */
+            }
+        }
+    }
+
+    bs_write_ue( s, pps->i_num_ref_idx_l0_active - 1 );
+    bs_write_ue( s, pps->i_num_ref_idx_l1_active - 1 );
+    bs_write( s, 1, pps->b_weighted_pred );
+    bs_write( s, 2, pps->b_weighted_bipred );       /* two-bit field */
+
+    bs_write_se( s, pps->i_pic_init_qp - 26 );      /* QP/QS are sent as offsets from 26 */
+    bs_write_se( s, pps->i_pic_init_qs - 26 );
+    bs_write_se( s, pps->i_chroma_qp_index_offset );
+
+    bs_write( s, 1, pps->b_deblocking_filter_control );
+    bs_write( s, 1, pps->b_constrained_intra_pred );
+    bs_write( s, 1, pps->b_redundant_pic_cnt );
+
+    bs_rbsp_trailing( s );
+}
+
diff --git a/encoder/set.h b/encoder/set.h
new file mode 100644
index 00000000..44cae088
--- /dev/null
+++ b/encoder/set.h
@@ -0,0 +1,32 @@
+/*****************************************************************************
+ * set.h: h264 encoder
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: set.h,v 1.1 2004/06/03 19:27:08 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#ifndef _ENCODER_SET_H
+#define _ENCODER_SET_H 1
+/* SPS/PPS helpers: *_init() fills the structure from the parameters, *_write() emits it to a bitstream. */
+void x264_sps_init( x264_sps_t *sps, int i_id, x264_param_t *param );
+void x264_sps_write( bs_t *s, x264_sps_t *sps );
+void x264_pps_init( x264_pps_t *pps, int i_id, x264_param_t *param, x264_sps_t *sps );
+void x264_pps_write( bs_t *s, x264_pps_t *pps );
+
+#endif
diff --git a/extras/getopt.c b/extras/getopt.c
new file mode 100644
index 00000000..3b5e196b
--- /dev/null
+++ b/extras/getopt.c
@@ -0,0 +1,503 @@
+/*	$NetBSD: getopt_long.c,v 1.15 2002/01/31 22:43:40 tv Exp $	*/
+
+/*-
+ * Copyright (c) 2000 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Dieter Baron and Thomas Klausner.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *        This product includes software developed by the NetBSD
+ *        Foundation, Inc. and its contributors.
+ * 4. Neither the name of The NetBSD Foundation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <assert.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <string.h>
+#include <getopt.h>
+#include <stdarg.h>
+#include <stdio.h>
+
+#define REPLACE_GETOPT
+
+#define _DIAGASSERT(x) do {} while (0)
+
+#ifdef REPLACE_GETOPT
+#ifdef __weak_alias
+__weak_alias(getopt,_getopt)
+#endif
+int opterr = 1;	/* if error message should be printed */
+int optind = 1;	/* index into parent argv vector */
+int optopt = '?';	/* character checked for validity */
+int optreset;	/* reset getopt */
+char *optarg;	/* argument associated with option */
+#endif
+
+#ifdef __weak_alias
+__weak_alias(getopt_long,_getopt_long)
+#endif
+
+#ifndef __CYGWIN__
+#define __progname __argv[0]
+#else
+extern char *__progname;
+#endif
+
+#define IGNORE_FIRST	(*options == '-' || *options == '+')
+#define PRINT_ERROR	((opterr) && ((*options != ':') \
+				      || (IGNORE_FIRST && options[1] != ':')))
+/* NOTE(review): true unless POSIXLY_INCORRECT_GETOPT is set, so PERMUTE/IN_ORDER are off by default -- confirm intent. */
+#define IS_POSIXLY_CORRECT (getenv("POSIXLY_INCORRECT_GETOPT") == NULL)
+
+#define PERMUTE         (!IS_POSIXLY_CORRECT && !IGNORE_FIRST)
+/* XXX: GNU ignores PC if *options == '-' */
+#define IN_ORDER        (!IS_POSIXLY_CORRECT && *options == '-')
+
+/* return values */
+#define	BADCH	(int)'?'
+#define	BADARG		((IGNORE_FIRST && options[1] == ':') \
+			 || (*options == ':') ? (int)':' : (int)'?')
+#define INORDER (int)1
+
+static char EMSG[1];
+
+static int getopt_internal (int, char * const *, const char *);
+static int gcd (int, int);
+static void permute_args (int, int, int, char * const *);
+
+static char *place = EMSG; /* option letter processing */
+
+/* XXX: set optreset to 1 rather than these two */
+static int nonopt_start = -1; /* first non option argument (for permute) */
+static int nonopt_end = -1;   /* first option after non options (for permute) */
+
+/* Error messages */
+static const char recargchar[] = "option requires an argument -- %c";
+static const char recargstring[] = "option requires an argument -- %s";
+static const char ambig[] = "ambiguous option -- %.*s";
+static const char noarg[] = "option doesn't take an argument -- %.*s";
+static const char illoptchar[] = "unknown option -- %c";
+static const char illoptstring[] = "unknown option -- %s";
+/* Print "<__progname>: ", then the formatted message (skipped when fmt is NULL), then a newline, to stderr. */
+static void
+_vwarnx(const char *fmt, va_list ap)
+{
+  (void)fprintf(stderr, "%s: ", __progname);
+  if (fmt != NULL)
+    (void)vfprintf(stderr, fmt, ap);
+  (void)fprintf(stderr, "\n");
+}
+/* printf-style front end to _vwarnx(): collects the variadic arguments and forwards them. */
+static void
+warnx(const char *fmt, ...)
+{
+  va_list ap;
+  va_start(ap, fmt);
+  _vwarnx(fmt, ap);
+  va_end(ap);
+}
+
+/*
+ * Compute the greatest common divisor of a and b (Euclid's algorithm).
+ * The result is identical to the historical version for b != 0; for
+ * b == 0 the function now returns a instead of dividing by zero.
+ */
+static int
+gcd(int a, int b)
+{
+	int c;
+
+	/* invariant: gcd(a, b) is unchanged by (a, b) -> (b, a % b) */
+	while (b != 0) {
+		c = a % b;
+		a = b;
+		b = c;
+	}
+
+	return a;
+}
+
+/*
+ * Exchange the block from nonopt_start to nonopt_end with the block
+ * from nonopt_end to opt_end (keeping the same order of arguments
+ * in each block).
+ *
+ * The rotation is done in place, cycle by cycle.  NOTE(review): both
+ * blocks empty would make the cycle-length division fault -- confirm callers.
+ */
+static void
+permute_args(int panonopt_start, int panonopt_end, int opt_end,
+    char * const *nargv)
+{
+	int cstart, cyclelen, i, j, ncycle, nnonopts, nopts, pos;
+	char *swap;
+
+	_DIAGASSERT(nargv != NULL);
+
+	/*
+	 * compute lengths of blocks and number and size of cycles
+	 */
+	nnonopts = panonopt_end - panonopt_start;
+	nopts = opt_end - panonopt_end;
+	ncycle = gcd(nnonopts, nopts);
+	cyclelen = (opt_end - panonopt_start) / ncycle;
+
+	for (i = 0; i < ncycle; i++) {
+		cstart = panonopt_end+i;
+		pos = cstart;
+		for (j = 0; j < cyclelen; j++) {
+			if (pos >= panonopt_end)
+				pos -= nnonopts;
+			else
+				pos += nopts;
+			swap = nargv[pos];
+			/* LINTED const cast */
+			((char **) nargv)[pos] = nargv[cstart];
+			/* LINTED const cast */
+			((char **)nargv)[cstart] = swap;
+		}
+	}
+}
+
+/*
+ * getopt_internal --
+ *	Parse argc/argv argument vector.  Called by user level routines.
+ *	Returns the option character, BADCH/BADARG on error, INORDER for
+ *	a non-option in in-order mode, -1 when parsing stops, and -2 if
+ *	-- is found (can be long option or end of options marker) or a
+ *	"-W" long option is seen; optarg/optind/optopt are updated.
+ */
+static int
+getopt_internal(int nargc, char * const *nargv, const char *options)
+{
+	char *oli;				/* option letter list index */
+	int optchar;
+
+	_DIAGASSERT(nargv != NULL);
+	_DIAGASSERT(options != NULL);
+
+	optarg = NULL;
+
+	/*
+	 * XXX Some programs (like rsyncd) expect to be able to
+	 * XXX re-initialize optind to 0 and have getopt_long(3)
+	 * XXX properly function again.  Work around this braindamage.
+	 */
+	if (optind == 0)
+		optind = 1;
+
+	if (optreset)
+		nonopt_start = nonopt_end = -1;
+start:
+	if (optreset || !*place) {		/* update scanning pointer */
+		optreset = 0;
+		if (optind >= nargc) {          /* end of argument vector */
+			place = EMSG;
+			if (nonopt_end != -1) {
+				/* do permutation, if we have to */
+				permute_args(nonopt_start, nonopt_end,
+				    optind, nargv);
+				optind -= nonopt_end - nonopt_start;
+			}
+			else if (nonopt_start != -1) {
+				/*
+				 * If we skipped non-options, set optind
+				 * to the first of them.
+				 */
+				optind = nonopt_start;
+			}
+			nonopt_start = nonopt_end = -1;
+			return -1;
+		}
+		if ((*(place = nargv[optind]) != '-')
+		    || (place[1] == '\0')) {    /* found non-option */
+			place = EMSG;
+			if (IN_ORDER) {
+				/*
+				 * GNU extension:
+				 * return non-option as argument to option 1
+				 */
+				optarg = nargv[optind++];
+				return INORDER;
+			}
+			if (!PERMUTE) {
+				/*
+				 * if no permutation wanted, stop parsing
+				 * at first non-option
+				 */
+				return -1;
+			}
+			/* do permutation */
+			if (nonopt_start == -1)
+				nonopt_start = optind;
+			else if (nonopt_end != -1) {
+				permute_args(nonopt_start, nonopt_end,
+				    optind, nargv);
+				nonopt_start = optind -
+				    (nonopt_end - nonopt_start);
+				nonopt_end = -1;
+			}
+			optind++;
+			/* process next argument */
+			goto start;
+		}
+		if (nonopt_start != -1 && nonopt_end == -1)
+			nonopt_end = optind;
+		if (place[1] && *++place == '-') {	/* found "--" */
+			place++;
+			return -2;
+		}
+	}
+	if ((optchar = (int)*place++) == (int)':' ||
+	    (oli = strchr(options + (IGNORE_FIRST ? 1 : 0), optchar)) == NULL) {
+		/* option letter unknown or ':' */
+		if (!*place)
+			++optind;
+		if (PRINT_ERROR)
+			warnx(illoptchar, optchar);
+		optopt = optchar;
+		return BADCH;
+	}
+	if (optchar == 'W' && oli[1] == ';') {		/* -W long-option */
+		/* XXX: what if no long options provided (called by getopt)? */
+		if (*place)
+			return -2;
+
+		if (++optind >= nargc) {	/* no arg */
+			place = EMSG;
+			if (PRINT_ERROR)
+				warnx(recargchar, optchar);
+			optopt = optchar;
+			return BADARG;
+		} else				/* white space */
+			place = nargv[optind];
+		/*
+		 * Handle -W arg the same as --arg (which causes getopt to
+		 * stop parsing).
+		 */
+		return -2;
+	}
+	if (*++oli != ':') {			/* doesn't take argument */
+		if (!*place)
+			++optind;
+	} else {				/* takes (optional) argument */
+		optarg = NULL;
+		if (*place)			/* no white space */
+			optarg = place;
+		/* XXX: disable test for :: if PC? (GNU doesn't) */
+		else if (oli[1] != ':') {	/* arg not optional */
+			if (++optind >= nargc) {	/* no arg */
+				place = EMSG;
+				if (PRINT_ERROR)
+					warnx(recargchar, optchar);
+				optopt = optchar;
+				return BADARG;
+			} else
+				optarg = nargv[optind];
+		}
+		place = EMSG;
+		++optind;
+	}
+	/* dump back option letter */
+	return optchar;
+}
+
+#ifdef REPLACE_GETOPT
+/*
+ * getopt --
+ *	Parse argc/argv argument vector.
+ *	Returns what getopt_internal() returns, except that -2 ("--"
+ *	seen) is turned into -1 after any skipped non-options have been
+ *	permuted behind the options.
+ *
+ * [eventually this will replace the real getopt]
+ */
+int
+getopt(int nargc, char * const *nargv, const char *options)
+{
+	int retval;
+
+	_DIAGASSERT(nargv != NULL);
+	_DIAGASSERT(options != NULL);
+
+	if ((retval = getopt_internal(nargc, nargv, options)) == -2) {
+		++optind;
+		/*
+		 * We found an option (--), so if we skipped non-options,
+		 * we have to permute.
+		 */
+		if (nonopt_end != -1) {
+			permute_args(nonopt_start, nonopt_end, optind,
+				       nargv);
+			optind -= nonopt_end - nonopt_start;
+		}
+		nonopt_start = nonopt_end = -1;
+		retval = -1;
+	}
+	return retval;
+}
+#endif
+
+/*
+ * getopt_long --
+ *	Parse argc/argv argument vector, also accepting "--name[=value]"
+ *	options described by long_options (unique prefixes are accepted).
+ *	Returns the matched entry's val, or 0 after storing val in *flag;
+ *	BADCH/BADARG on error; -1 at the end.  *idx, when non-NULL,
+ *	receives the index of the matched long option.
+ */
+int
+getopt_long(int nargc, char * const *nargv, const char *options,
+    const struct option *long_options, int *idx)
+{
+	int retval;
+
+	_DIAGASSERT(nargv != NULL);
+	_DIAGASSERT(options != NULL);
+	_DIAGASSERT(long_options != NULL);
+	/* idx may be NULL */
+
+	if ((retval = getopt_internal(nargc, nargv, options)) == -2) {
+		char *current_argv, *has_equal;
+		size_t current_argv_len;
+		int i, match;
+
+		current_argv = place;
+		match = -1;
+
+		optind++;
+		place = EMSG;
+
+		if (*current_argv == '\0') {		/* found "--" */
+			/*
+			 * We found an option (--), so if we skipped
+			 * non-options, we have to permute.
+			 */
+			if (nonopt_end != -1) {
+				permute_args(nonopt_start, nonopt_end,
+				    optind, nargv);
+				optind -= nonopt_end - nonopt_start;
+			}
+			nonopt_start = nonopt_end = -1;
+			return -1;
+		}
+		if ((has_equal = strchr(current_argv, '=')) != NULL) {
+			/* argument found (--option=arg) */
+			current_argv_len = has_equal - current_argv;
+			has_equal++;
+		} else
+			current_argv_len = strlen(current_argv);
+
+		for (i = 0; long_options[i].name; i++) {
+			/* find matching long option */
+			if (strncmp(current_argv, long_options[i].name,
+			    current_argv_len))
+				continue;
+
+			if (strlen(long_options[i].name) ==
+			    (unsigned)current_argv_len) {
+				/* exact match */
+				match = i;
+				break;
+			}
+			if (match == -1)		/* partial match */
+				match = i;
+			else {
+				/* ambiguous abbreviation */
+				if (PRINT_ERROR)
+					warnx(ambig, (int)current_argv_len,
+					     current_argv);
+				optopt = 0;
+				return BADCH;
+			}
+		}
+		if (match != -1) {			/* option found */
+			if (long_options[match].has_arg == no_argument
+			    && has_equal) {
+				if (PRINT_ERROR)
+					warnx(noarg, (int)current_argv_len,
+					     current_argv);
+				/*
+				 * XXX: GNU sets optopt to val regardless of
+				 * flag
+				 */
+				if (long_options[match].flag == NULL)
+					optopt = long_options[match].val;
+				else
+					optopt = 0;
+				return BADARG;
+			}
+			if (long_options[match].has_arg == required_argument ||
+			    long_options[match].has_arg == optional_argument) {
+				if (has_equal)
+					optarg = has_equal;
+				else if (long_options[match].has_arg ==
+				    required_argument) {
+					/*
+					 * optional argument doesn't use
+					 * next nargv
+					 */
+					optarg = nargv[optind++];
+				}
+			}
+			if ((long_options[match].has_arg == required_argument)
+			    && (optarg == NULL)) {
+				/*
+				 * Missing argument; leading ':'
+				 * indicates no error should be generated
+				 */
+				if (PRINT_ERROR)
+					warnx(recargstring, current_argv);
+				/*
+				 * XXX: GNU sets optopt to val regardless
+				 * of flag
+				 */
+				if (long_options[match].flag == NULL)
+					optopt = long_options[match].val;
+				else
+					optopt = 0;
+				--optind;
+				return BADARG;
+			}
+		} else {			/* unknown option */
+			if (PRINT_ERROR)
+				warnx(illoptstring, current_argv);
+			optopt = 0;
+			return BADCH;
+		}
+		if (long_options[match].flag) {
+			*long_options[match].flag = long_options[match].val;
+			retval = 0;
+		} else
+			retval = long_options[match].val;
+		if (idx)
+			*idx = match;
+	}
+	return retval;
+}
diff --git a/extras/getopt.h b/extras/getopt.h
new file mode 100644
index 00000000..18e10269
--- /dev/null
+++ b/extras/getopt.h
@@ -0,0 +1,179 @@
+/* Declarations for getopt.
+   Copyright (C) 1989-1994, 1996-1999, 2001 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2, or (at your option)
+   any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software Foundation,
+   Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.  */
+
+#ifndef _GETOPT_H
+
+#ifndef __need_getopt
+# define _GETOPT_H 1
+#endif
+
+/* If __GNU_LIBRARY__ is not already defined, either we are being used
+   standalone, or this is the first header included in the source file.
+   If we are being used with glibc, we need to include <features.h>, but
+   that does not exist if we are standalone.  So: if __GNU_LIBRARY__ is
+   not defined, include <ctype.h>, which will pull in <features.h> for us
+   if it's from glibc.  (Why ctype.h?  It's guaranteed to exist and it
+   doesn't flood the namespace with stuff the way some other headers do.)  */
+#if !defined __GNU_LIBRARY__
+# include <ctype.h>
+#endif
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+/* For communication from `getopt' to the caller.
+   When `getopt' finds an option that takes an argument,
+   the argument value is returned here.
+   Also, when `ordering' is RETURN_IN_ORDER,
+   each non-option ARGV-element is returned here.  */
+
+extern char *optarg;
+
+/* Index in ARGV of the next element to be scanned.
+   This is used for communication to and from the caller
+   and for communication between successive calls to `getopt'.
+
+   On entry to `getopt', zero means this is the first call; initialize.
+
+   When `getopt' returns -1, this is the index of the first of the
+   non-option elements that the caller should itself scan.
+
+   Otherwise, `optind' communicates from one call to the next
+   how much of ARGV has been scanned so far.  */
+
+extern int optind;
+
+/* Callers store zero here to inhibit the error message `getopt' prints
+   for unrecognized options.  */
+
+extern int opterr;
+
+/* Set to an option character which was unrecognized.  */
+
+extern int optopt;
+
+#ifndef __need_getopt
+/* Describe the long-named options requested by the application.
+   The LONG_OPTIONS argument to getopt_long or getopt_long_only is a vector
+   of `struct option' terminated by an element containing a name which is
+   zero.
+
+   The field `has_arg' is:
+   no_argument		(or 0) if the option does not take an argument,
+   required_argument	(or 1) if the option requires an argument,
+   optional_argument 	(or 2) if the option takes an optional argument.
+
+   If the field `flag' is not NULL, it points to a variable that is set
+   to the value given in the field `val' when the option is found, but
+   left unchanged if the option is not found.
+
+   To have a long-named option do something other than set an `int' to
+   a compiled-in constant, such as set a value from `optarg', set the
+   option's `flag' field to zero and its `val' field to a nonzero
+   value (the equivalent single-letter option character, if there is
+   one).  For long options that have a zero `flag' field, `getopt'
+   returns the contents of the `val' field.  */
+
+struct option
+{
+# if (defined __STDC__ && __STDC__) || defined __cplusplus
+  const char *name;
+# else
+  char *name;
+# endif
+  /* has_arg can't be an enum because some compilers complain about
+     type mismatches in all the code that assumes it is an int.  */
+  int has_arg;
+  int *flag;
+  int val;
+};
+
+/* Names for the values of the `has_arg' field of `struct option'.  */
+
+# define no_argument		0
+# define required_argument	1
+# define optional_argument	2
+#endif	/* need getopt */
+
+
+/* Get definitions and prototypes for functions to process the
+   arguments in ARGV (ARGC of them, minus the program name) for
+   options given in OPTS.
+
+   Return the option character from OPTS just read.  Return -1 when
+   there are no more options.  For unrecognized options, or options
+   missing arguments, `optopt' is set to the option letter, and '?' is
+   returned.
+
+   The OPTS string is a list of characters which are recognized option
+   letters, optionally followed by colons, specifying that that letter
+   takes an argument, to be placed in `optarg'.
+
+   If a letter in OPTS is followed by two colons, its argument is
+   optional.  This behavior is specific to the GNU `getopt'.
+
+   The argument `--' causes premature termination of argument
+   scanning, explicitly telling `getopt' that there are no more
+   options.
+
+   If OPTS begins with `--', then non-option arguments are treated as
+   arguments to the option '\0'.  This behavior is specific to the GNU
+   `getopt'.  */
+
+#if (defined __STDC__ && __STDC__) || defined __cplusplus
+# ifdef __GNU_LIBRARY__
+/* Many other libraries have conflicting prototypes for getopt, with
+   differences in the consts, in stdlib.h.  To avoid compilation
+   errors, only prototype getopt for the GNU C library.  */
+extern int getopt (int __argc, char *const *__argv, const char *__shortopts);
+# else /* not __GNU_LIBRARY__ */
+extern int getopt ();
+# endif /* __GNU_LIBRARY__ */
+
+# ifndef __need_getopt
+extern int getopt_long (int __argc, char *const *__argv, const char *__shortopts,
+		        const struct option *__longopts, int *__longind);
+extern int getopt_long_only (int __argc, char *const *__argv,
+			     const char *__shortopts,
+		             const struct option *__longopts, int *__longind);
+
+/* Internal only.  Users should not call this directly.  */
+extern int _getopt_internal (int __argc, char *const *__argv,
+			     const char *__shortopts,
+		             const struct option *__longopts, int *__longind,
+			     int __long_only);
+# endif
+#else /* not __STDC__ */
+extern int getopt ();
+# ifndef __need_getopt
+extern int getopt_long ();
+extern int getopt_long_only ();
+
+extern int _getopt_internal ();
+# endif
+#endif /* __STDC__ */
+
+#ifdef	__cplusplus
+}
+#endif
+
+/* Make sure we later can get all the definitions and declarations.  */
+#undef __need_getopt
+
+#endif /* getopt.h */
diff --git a/extras/stdint.h b/extras/stdint.h
new file mode 100644
index 00000000..92dfa884
--- /dev/null
+++ b/extras/stdint.h
@@ -0,0 +1,186 @@
+/* ISO C9x  7.18  Integer types <stdint.h>
+ * Based on ISO/IEC SC22/WG14 9899 Committee draft (SC22 N2794)
+ *
+ *  THIS SOFTWARE IS NOT COPYRIGHTED
+ *
+ *  Contributor: Danny Smith <danny_r_smith_2001@yahoo.co.nz>
+ *
+ *  This source code is offered for use in the public domain. You may
+ *  use, modify or distribute it freely.
+ *
+ *  This code is distributed in the hope that it will be useful but
+ *  WITHOUT ANY WARRANTY. ALL WARRANTIES, EXPRESS OR IMPLIED ARE HEREBY
+ *  DISCLAIMED. This includes but is not limited to warranties of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ *  Date: 2000-12-02
+ */
+
+
+#ifndef _STDINT_H
+#define _STDINT_H
+#define __need_wint_t
+#define __need_wchar_t
+#include <stddef.h>
+
+/* 7.18.1.1  Exact-width integer types */
+typedef signed char int8_t;
+typedef unsigned char   uint8_t;
+typedef short  int16_t;
+typedef unsigned short  uint16_t;
+typedef int  int32_t;
+typedef unsigned   uint32_t;
+typedef __int64  int64_t;
+typedef unsigned __int64 uint64_t;
+
+/* 7.18.1.2  Minimum-width integer types */
+typedef signed char int_least8_t;
+typedef unsigned char   uint_least8_t;
+typedef short  int_least16_t;
+typedef unsigned short  uint_least16_t;
+typedef int  int_least32_t;
+typedef unsigned   uint_least32_t;
+typedef __int64  int_least64_t;
+typedef unsigned __int64   uint_least64_t;
+
+/*  7.18.1.3  Fastest minimum-width integer types 
+ *  Not actually guaranteed to be fastest for all purposes
+ *  Here we use the exact-width types for 8 and 16-bit ints. 
+ */
+typedef signed char int_fast8_t; /* explicitly signed: plain char may be unsigned, which would contradict INT_FAST8_MIN */
+typedef unsigned char uint_fast8_t;
+typedef short  int_fast16_t;
+typedef unsigned short  uint_fast16_t;
+typedef int  int_fast32_t;
+typedef unsigned  int  uint_fast32_t;
+typedef __int64  int_fast64_t;
+typedef unsigned __int64   uint_fast64_t;
+
+/* 7.18.1.4  Integer types capable of holding object pointers */
+/*typedef int intptr_t;
+typedef unsigned uintptr_t;*/
+
+/* 7.18.1.5  Greatest-width integer types */
+typedef __int64  intmax_t;
+typedef unsigned __int64   uintmax_t;
+
+/* 7.18.2  Limits of specified-width integer types */
+#if !defined ( __cplusplus) || defined (__STDC_LIMIT_MACROS)
+
+/* 7.18.2.1  Limits of exact-width integer types */
+#define INT8_MIN (-128) 
+#define INT16_MIN (-32768)
+#define INT32_MIN (-2147483647 - 1)
+#define INT64_MIN  (-9223372036854775807LL - 1)
+
+#define INT8_MAX 127
+#define INT16_MAX 32767
+#define INT32_MAX 2147483647
+#define INT64_MAX 9223372036854775807LL
+
+#define UINT8_MAX 0xff /* 255U */
+#define UINT16_MAX 0xffff /* 65535U */
+#define UINT32_MAX 0xffffffff  /* 4294967295U */
+#define UINT64_MAX 0xffffffffffffffffULL /* 18446744073709551615ULL */
+
+/* 7.18.2.2  Limits of minimum-width integer types */
+#define INT_LEAST8_MIN INT8_MIN
+#define INT_LEAST16_MIN INT16_MIN
+#define INT_LEAST32_MIN INT32_MIN
+#define INT_LEAST64_MIN INT64_MIN
+
+#define INT_LEAST8_MAX INT8_MAX
+#define INT_LEAST16_MAX INT16_MAX
+#define INT_LEAST32_MAX INT32_MAX
+#define INT_LEAST64_MAX INT64_MAX
+
+#define UINT_LEAST8_MAX UINT8_MAX
+#define UINT_LEAST16_MAX UINT16_MAX
+#define UINT_LEAST32_MAX UINT32_MAX
+#define UINT_LEAST64_MAX UINT64_MAX
+
+/* 7.18.2.3  Limits of fastest minimum-width integer types */
+#define INT_FAST8_MIN INT8_MIN
+#define INT_FAST16_MIN INT16_MIN
+#define INT_FAST32_MIN INT32_MIN
+#define INT_FAST64_MIN INT64_MIN
+
+#define INT_FAST8_MAX INT8_MAX
+#define INT_FAST16_MAX INT16_MAX
+#define INT_FAST32_MAX INT32_MAX
+#define INT_FAST64_MAX INT64_MAX
+
+#define UINT_FAST8_MAX UINT8_MAX
+#define UINT_FAST16_MAX UINT16_MAX
+#define UINT_FAST32_MAX UINT32_MAX
+#define UINT_FAST64_MAX UINT64_MAX
+
+/* 7.18.2.4  Limits of integer types capable of holding
+    object pointers */ 
+#define INTPTR_MIN INT32_MIN
+#define INTPTR_MAX INT32_MAX
+#define UINTPTR_MAX UINT32_MAX
+
+/* 7.18.2.5  Limits of greatest-width integer types */
+#define INTMAX_MIN INT64_MIN
+#define INTMAX_MAX INT64_MAX
+#define UINTMAX_MAX UINT64_MAX
+
+/* 7.18.3  Limits of other integer types */
+#define PTRDIFF_MIN INT32_MIN
+#define PTRDIFF_MAX INT32_MAX
+
+#define SIG_ATOMIC_MIN INT32_MIN
+#define SIG_ATOMIC_MAX INT32_MAX
+
+#define SIZE_MAX UINT32_MAX
+
+#ifndef WCHAR_MIN  /* also in wchar.h */ 
+#define WCHAR_MIN 0
+#define WCHAR_MAX ((wchar_t)-1) /* UINT16_MAX */
+#endif
+
+/*
+ * wint_t is unsigned short for compatibility with MS runtime
+ */
+#define WINT_MIN 0
+#define WINT_MAX ((wint_t)-1) /* UINT16_MAX */
+
+#endif /* !defined ( __cplusplus) || defined __STDC_LIMIT_MACROS */
+
+
+/* 7.18.4  Macros for integer constants */
+#if !defined ( __cplusplus) || defined (__STDC_CONSTANT_MACROS)
+
+/* 7.18.4.1  Macros for minimum-width integer constants
+
+    According to Douglas Gwyn <gwyn@arl.mil>:
+	"This spec was changed in ISO/IEC 9899:1999 TC1; in ISO/IEC
+	9899:1999 as initially published, the expansion was required
+	to be an integer constant of precisely matching type, which
+	is impossible to accomplish for the shorter types on most
+	platforms, because C99 provides no standard way to designate
+	an integer constant with width less than that of type int.
+	TC1 changed this to require just an integer constant
+	*expression* with *promoted* type."
+
+	The trick used here is from Clive D W Feather.
+*/
+
+#define INT8_C(val) (INT_LEAST8_MAX-INT_LEAST8_MAX+(val))
+#define INT16_C(val) (INT_LEAST16_MAX-INT_LEAST16_MAX+(val))
+#define INT32_C(val) (INT_LEAST32_MAX-INT_LEAST32_MAX+(val))
+#define INT64_C(val) (INT_LEAST64_MAX-INT_LEAST64_MAX+(val))
+
+#define UINT8_C(val) (UINT_LEAST8_MAX-UINT_LEAST8_MAX+(val))
+#define UINT16_C(val) (UINT_LEAST16_MAX-UINT_LEAST16_MAX+(val))
+#define UINT32_C(val) (UINT_LEAST32_MAX-UINT_LEAST32_MAX+(val))
+#define UINT64_C(val) (UINT_LEAST64_MAX-UINT_LEAST64_MAX+(val))
+
+/* 7.18.4.2  Macros for greatest-width integer constants */
+#define INTMAX_C(val) (INTMAX_MAX-INTMAX_MAX+(val))
+#define UINTMAX_C(val) (UINTMAX_MAX-UINTMAX_MAX+(val))
+
+#endif  /* !defined ( __cplusplus) || defined __STDC_CONSTANT_MACROS */
+
+#endif
diff --git a/testing/checkasm.c b/testing/checkasm.c
new file mode 100644
index 00000000..ec3283ba
--- /dev/null
+++ b/testing/checkasm.c
@@ -0,0 +1,347 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#
+#include "core/common.h"
+#ifdef HAVE_MMXEXT
+#include "core/i386/pixel.h"
+#include "core/i386/dct.h"
+#include "core/i386/mc.h"
+#endif
+#ifdef HAVE_ALTIVEC
+#include "core/ppc/pixel.h"
+#endif
+
+/* buf1, buf2: initialised to random data; the tests must not write into them */
+uint8_t * buf1, * buf2;
+/* buf3, buf4: used to store output */
+uint8_t * buf3, * buf4;
+
+/* Compare the asm pixel functions (sad, satd, avg) with the C reference; returns 0 when all agree, -1 otherwise. */
+static int check_pixel()
+{
+    x264_pixel_function_t pixel_c = {{0},{0},{0}};
+    x264_pixel_function_t pixel_asm = {{0}, {0},{0}};
+    int ret = 0, ok, i;
+
+    memset( &pixel_asm, 0, sizeof( x264_pixel_function_t ) );
+    x264_pixel_init( 0, &pixel_c );
+#ifdef HAVE_MMXEXT
+    x264_pixel_init( X264_CPU_MMX|X264_CPU_MMXEXT, &pixel_asm );
+#endif
+#ifdef HAVE_ALTIVEC
+    x264_pixel_altivec_init( &pixel_asm );
+#endif
+
+    for( i = 0, ok = 1; i < 7; i++ )
+    {
+        int res_c, res_asm;
+        if( pixel_asm.sad[i] ) /* entries left NULL have no asm version and are skipped */
+        {
+            res_c   = pixel_c.sad[i]( buf1, 32, buf2, 32 );
+            res_asm =  pixel_asm.sad[i]( buf1, 32, buf2, 32 );
+            if( res_c != res_asm )
+            {
+                ok = 0;
+                fprintf( stderr, "sad[%d]: %d != %d [FAILED]\n", i, res_c, res_asm );
+            }
+        }
+    }
+    if( ok )
+        fprintf( stderr, " - pixel sad :           [OK]\n" );
+    else {
+        ret = -1;
+        fprintf( stderr, " - pixel sad :           [FAILED]\n" );
+    }
+
+    for( i = 0, ok = 1; i < 7; i++ )
+    {
+        int res_c, res_asm;
+        if( pixel_asm.satd[i] )
+        {
+            res_c   = pixel_c.satd[i]( buf1, 32, buf2, 32 );
+            res_asm = pixel_asm.satd[i]( buf1, 32, buf2, 32 );
+            if( res_c != res_asm )
+            {
+                ok = 0;
+                fprintf( stderr, "satd[%d]: %d != %d [FAILED]\n", i, res_c, res_asm );
+            }
+        }
+    }
+
+    if( ok )
+        fprintf( stderr, " - pixel satd :          [OK]\n" );
+    else {
+        ret = -1;
+        fprintf( stderr, " - pixel satd :          [FAILED]\n" );
+    }
+    /* avg: run both versions on private copies of buf1 (buf3 for C, buf4 for asm) and compare the two outputs */
+    for( i = 0, ok = 1; i < 7; i++ )
+    {
+        if( pixel_asm.avg[i] )
+        {
+            memcpy( buf3, buf1, 32*32 );
+            memcpy( buf4, buf1, 32*32 );
+            pixel_c.avg[i]( buf3, 32, buf2, 32 );   /* was satd[i], which leaves buf3 untouched and made this test vacuous */
+            pixel_asm.avg[i]( buf4, 32, buf2, 32 ); /* NOTE(review): assumes avg[] takes (dst, stride, src, stride) like the calls it replaces - confirm in core/pixel.h */
+            if( memcmp( buf3, buf4, 32*32 ) )
+            {
+                ok = 0;
+                fprintf( stderr, "avg[%d]: [FAILED]\n", i );
+            }
+        }
+    }
+
+    if( ok )
+        fprintf( stderr, " - pixel avg :           [OK]\n" );
+    else {
+        ret = -1;
+        fprintf( stderr, " - pixel avg :           [FAILED]\n" );
+    }
+
+    return ret;
+}
+
+static int check_dct() /* compare the asm DCT/IDCT functions with the C reference; returns 0 when all agree, -1 otherwise */
+{
+    x264_dct_function_t dct_c;
+    x264_dct_function_t dct_asm;
+    int ret = 0, ok;
+    int16_t dct1[16][4][4] __attribute((aligned(16)));
+    int16_t dct2[16][4][4] __attribute((aligned(16)));
+
+    memset( &dct_asm, 0, sizeof( dct_asm ) ); /* entries stay NULL unless an asm init below fills them; every test is guarded on that */
+    x264_dct_init( 0, &dct_c );
+#ifdef HAVE_MMXEXT
+    x264_dct_init( X264_CPU_MMX|X264_CPU_MMXEXT, &dct_asm );
+#endif
+#define TEST_DCT( name, t1, t2, size ) \
+    if( dct_asm.name ) \
+    { \
+        dct_c.name( t1, buf1, 32, buf2, 32 ); \
+        dct_asm.name( t2, buf1, 32, buf2, 32 ); \
+        if( memcmp( t1, t2, size ) ) \
+        { \
+            ok = 0; \
+            fprintf( stderr, #name " [FAILED]\n" ); \
+        } \
+    }
+    ok = 1; /* TEST_DCT: run the C and asm versions of 'name' on buf1/buf2 (stride 32) and compare the first 'size' bytes of t1 and t2 */
+    TEST_DCT( sub4x4_dct, dct1[0], dct2[0], 16*2 );
+    TEST_DCT( sub8x8_dct, dct1, dct2, 16*2*4 );
+    TEST_DCT( sub16x16_dct, dct1, dct2, 16*2*16 );
+    if( ok )
+        fprintf( stderr, " - sub_dctXxX :          [OK]\n" );
+    else {
+        ret = -1;
+        fprintf( stderr, " - sub_dctXxX :          [FAILED]\n" );
+    }
+#undef TEST_DCT
+
+#define TEST_IDCT( name, t ) \
+    if( dct_asm.name ) \
+    { \
+        memcpy( buf3, buf1, 32*32 ); \
+        memcpy( buf4, buf1, 32*32 ); \
+        dct_c.name( buf3, 32, t ); \
+        dct_asm.name( buf4, 32, t ); \
+        if( memcmp( buf3, buf4, 32*32 ) ) \
+        { \
+            ok = 0; \
+            fprintf( stderr, #name " [FAILED]\n" ); \
+        } \
+    }
+    ok = 1; /* TEST_IDCT: apply the C version to a copy of buf1 in buf3 and the asm version to a copy in buf4, then compare the two 32x32 buffers */
+    TEST_IDCT( add4x4_idct, dct1[0] ); /* NOTE(review): dct1 is only written by the guarded TEST_DCT calls above; it is uninitialised here if none of them ran */
+    TEST_IDCT( add8x8_idct, dct1 );
+    TEST_IDCT( add16x16_idct, dct1 );
+    if( ok )
+        fprintf( stderr, " - add_idctXxX :         [OK]\n" );
+    else {
+        ret = -1;
+        fprintf( stderr, " - add_idctXxX :         [FAILED]\n" );
+    }
+#undef TEST_IDCT
+
+    ok = 1;
+    if( dct_asm.dct4x4dc )
+    {
+        int16_t dct1[4][4] __attribute((aligned(16))) = { {-12, 42, 23, 67},{2, 90, 89,56}, {67,43,-76,91},{56,-78,-54,1}}; /* shadows the outer dct1: fixed input, transformed in place */
+        int16_t dct2[4][4] __attribute((aligned(16))) = { {-12, 42, 23, 67},{2, 90, 89,56}, {67,43,-76,91},{56,-78,-54,1}};
+
+        dct_c.dct4x4dc( dct1 );
+        dct_asm.dct4x4dc( dct2 );
+        if( memcmp( dct1, dct2, 32 ) ) /* 32 bytes = 4*4 int16_t */
+        {
+            ok = 0;
+            fprintf( stderr, " - dct4x4dc :        [FAILED]\n" );
+        }
+    }
+    if( dct_asm.idct4x4dc )
+    {
+        int16_t dct1[4][4] __attribute((aligned(16))) = { {-12, 42, 23, 67},{2, 90, 89,56}, {67,43,-76,91},{56,-78,-54,1}};
+        int16_t dct2[4][4] __attribute((aligned(16))) = { {-12, 42, 23, 67},{2, 90, 89,56}, {67,43,-76,91},{56,-78,-54,1}};
+
+        dct_c.idct4x4dc( dct1 );
+        dct_asm.idct4x4dc( dct2 );
+        if( memcmp( dct1, dct2, 32 ) )
+        {
+            ok = 0;
+            fprintf( stderr, " - idct4x4dc :        [FAILED]\n" );
+        }
+    }
+    if( ok )
+        fprintf( stderr, " - (i)dct4x4dc :         [OK]\n" );
+    else {
+        ret = -1;
+        fprintf( stderr, " - (i)dct4x4dc :         [FAILED]\n" );
+    }
+
+    ok = 1;
+    if( dct_asm.dct2x2dc )
+    {
+        int16_t dct1[2][2] __attribute((aligned(16))) = { {-12, 42},{2, 90}};
+        int16_t dct2[2][2] __attribute((aligned(16))) = { {-12, 42},{2, 90}};
+
+        dct_c.dct2x2dc( dct1 );
+        dct_asm.dct2x2dc( dct2 );
+        if( memcmp( dct1, dct2, 4*2 ) ) /* 8 bytes = 2*2 int16_t */
+        {
+            ok = 0;
+            fprintf( stderr, " - dct2x2dc :        [FAILED]\n" );
+        }
+    }
+    if( dct_asm.idct2x2dc )
+    {
+        int16_t dct1[2][2] __attribute((aligned(16))) = { {-12, 42},{2, 90}};
+        int16_t dct2[2][2] __attribute((aligned(16))) = { {-12, 42},{2, 90}};
+
+        dct_c.idct2x2dc( dct1 );
+        dct_asm.idct2x2dc( dct2 );
+        if( memcmp( dct1, dct2, 4*2 ) )
+        {
+            ok = 0;
+            fprintf( stderr, " - idct2x2dc :       [FAILED]\n" );
+        }
+    }
+
+    if( ok )
+        fprintf( stderr, " - (i)dct2x2dc :         [OK]\n" );
+    else {
+        ret = -1;
+        fprintf( stderr, " - (i)dct2x2dc :         [FAILED]\n" );
+    }
+
+
+    return ret;
+}
+
+static int check_mc() /* compare the asm motion-compensation functions with the C reference; returns 0 when all agree, -1 otherwise */
+{
+    x264_mc_function_t mc_c[2] = {0};
+    x264_mc_function_t mc_asm[2] = {0};
+    uint8_t *src = &buf1[2*32+2]; /* row 2, column 2 of the 32-byte-wide source buffer */
+    uint8_t *dst1 = &buf3[2*32+2];
+    uint8_t *dst2 = &buf4[2*32+2];
+    int dx, dy;
+    int ret = 0, ok[2] = { 1, 1 }; /* ok[0] is reported as luma, ok[1] as chroma */
+
+    x264_mc_init( 0, mc_c );
+#ifdef HAVE_MMXEXT
+    x264_mc_mmxext_init( mc_asm );
+#endif
+
+    memset( buf3, 0, 32*32 );
+    memset( buf4, 0, 32*32 );
+
+    /* MC_TEST( t, w, h ): if mc_asm[t] exists, fill the first h*16 bytes of each destination with 0xCD, run the C and asm versions for the current (dx,dy) with destination stride 16, and compare 16*16 bytes */
+#define MC_TEST( t, w, h ) \
+        if( mc_asm[t] ) \
+        { \
+            memset(dst1, 0xCD, (h) * 16); \
+            mc_c[t]( src, 32, dst1, 16, dx, dy, w, h );     \
+            memset(dst2, 0xCD, (h) * 16); \
+            mc_asm[t]( src, 32, dst2, 16, dx, dy, w, h );   \
+            if( memcmp( dst1, dst2, 16*16 ) )               \
+            { \
+                fprintf( stderr, "mc["#t"][mv(%d,%d) %2dx%-2d]     [FAILED]\n", dx, dy, w, h );   \
+                ok[t] = 0; \
+            } \
+        }
+
+    for( dy = 0; dy < 4; dy++ ) /* every (dx,dy) pair in 0..3 x 0..3 */
+    {
+        for( dx = 0; dx < 4; dx++ )
+        {
+            MC_TEST( 0, 16, 16 );
+            MC_TEST( 0, 16, 8 );
+            MC_TEST( 0, 8, 16 );
+            MC_TEST( 0, 8, 8 );
+            MC_TEST( 0, 8, 4 );
+            MC_TEST( 0, 4, 8 );
+            MC_TEST( 0, 4, 4 );
+
+            MC_TEST( 1, 8, 8 );
+            MC_TEST( 1, 8, 4 );
+            MC_TEST( 1, 4, 8 );
+            MC_TEST( 1, 4, 4 );
+            MC_TEST( 1, 4, 2 );
+            MC_TEST( 1, 2, 4 );
+            MC_TEST( 1, 2, 2 );
+        }
+    }
+#undef MC_TEST
+    if( ok[0] )
+        fprintf( stderr, " - mc luma :             [OK]\n" );
+    else {
+        ret = -1;
+        fprintf( stderr, " - mc luma :             [FAILED]\n" );
+    }
+    if( ok[1] )
+        fprintf( stderr, " - mc chroma :           [OK]\n" );
+    else {
+        ret = -1;
+        fprintf( stderr, " - mc chroma :           [FAILED]\n" );
+    }
+    return ret;
+}
+
+/* Fill the shared buffers with random pixels, run every asm-vs-C check and return 0 on success, -1 on any failure. */
+int main()
+{
+    int ret, i;
+
+#ifdef HAVE_MMXEXT
+    fprintf( stderr, "x264: MMXEXT against C\n" );
+#elif defined(HAVE_ALTIVEC)
+    fprintf( stderr, "x264: ALTIVEC against C\n" );
+#endif
+
+    buf1 = x264_malloc( 1024 ); /* 32 x 32 */
+    buf2 = x264_malloc( 1024 );
+    buf3 = x264_malloc( 1024 );
+    buf4 = x264_malloc( 1024 );
+    if( !buf1 || !buf2 || !buf3 || !buf4 ) { fprintf( stderr, "x264: malloc failed\n" ); return -1; }
+    srand( x264_mdate() );
+
+    for( i = 0; i < 1024; i++ )
+    {
+        buf1[i] = rand() & 0xFF; /* full 0..255 range; '% 0xFF' could never produce 255 */
+        buf2[i] = rand() & 0xFF;
+        buf3[i] = buf4[i] = 0;
+    }
+
+    ret = check_pixel() +
+          check_dct() +
+          check_mc();
+
+    if( ret == 0 )
+    {
+        fprintf( stderr, "x264: All tests passed Yeah :)\n" );
+        return 0;
+    }
+    fprintf( stderr, "x264: at least one test has failed. Go and fix that Right Now!\n" );
+    return -1;
+}
+
diff --git a/testing/edge-detec.c b/testing/edge-detec.c
new file mode 100644
index 00000000..e78c71a6
--- /dev/null
+++ b/testing/edge-detec.c
@@ -0,0 +1,2733 @@
+/*****************************************************************************
+ * edge-detec.c: h264 encoder library
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: edge-detec.c,v 1.1 2004/06/03 19:27:08 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdint.h>
+#include <math.h>
+
+#include "common.h"
+#include "me.h"
+#include "vlc.h"
+
+static inline int x264_median( int a, int b, int c ) /* median of a, b and c: their sum minus the smallest and the largest */
+{
+    int min = a, max =a;
+    if( b < min )
+    {
+        min = b;
+    }
+    else
+    {
+        max = b;    /* min == max == a here, so b >= max always holds; assigning unconditionally is cheaper than testing 'b > max' */
+    }
+    if( c < min )
+    {
+        min = c;
+    }
+    else if( c > max )
+    {
+        max = c;
+    }
+
+    return a + b + c - min - max;
+}
+
+static const uint8_t intra4x4_cbp_to_golomb[48]= /* 48-entry table; by its name presumably coded block pattern -> Exp-Golomb code number for intra 4x4 MBs (user not visible here) - TODO confirm */
+{
+  3, 29, 30, 17, 31, 18, 37,  8, 32, 38, 19,  9, 20, 10, 11,  2,
+ 16, 33, 34, 21, 35, 22, 39,  4, 36, 40, 23,  5, 24,  6,  7,  1,
+ 41, 42, 43, 25, 44, 26, 46, 12, 45, 47, 27, 13, 28, 14, 15,  0
+};
+static const uint8_t inter_cbp_to_golomb[48]= /* same layout as the table above, presumably for inter MBs - TODO confirm */
+{
+  0,  2,  3,  7,  4,  8, 17, 13,  5, 18,  9, 14, 10, 15, 16, 11,
+  1, 32, 33, 36, 34, 37, 44, 40, 35, 45, 38, 41, 39, 42, 43, 19,
+  6, 24, 25, 20, 26, 21, 46, 28, 27, 47, 22, 29, 23, 30, 31, 12
+};
+
+static const uint8_t block_idx_x[16] = /* x position (0..3) of block idx */
+{
+    0, 1, 0, 1, 2, 3, 2, 3, 0, 1, 0, 1, 2, 3, 2, 3
+};
+static const uint8_t block_idx_y[16] = /* y position (0..3) of block idx */
+{
+    0, 0, 1, 1, 0, 0, 1, 1, 2, 2, 3, 3, 2, 2, 3, 3
+};
+static const uint8_t block_idx_xy[4][4] = /* inverse of the two tables above: block_idx_xy[block_idx_x[i]][block_idx_y[i]] == i */
+{
+    { 0, 2, 8,  10},
+    { 1, 3, 9,  11},
+    { 4, 6, 12, 14},
+    { 5, 7, 13, 15}
+};
+
+static const int quant_mf[6][4][4] = /* quantisation multipliers, indexed [i_qscale % 6][y][x] by quant_4x4 */
+{
+    {  { 13107, 8066, 13107, 8066}, {  8066, 5243,  8066, 5243},
+       { 13107, 8066, 13107, 8066}, {  8066, 5243,  8066, 5243}  },
+    {  { 11916, 7490, 11916, 7490}, {  7490, 4660,  7490, 4660},
+       { 11916, 7490, 11916, 7490}, {  7490, 4660,  7490, 4660}  },
+    {  { 10082, 6554, 10082, 6554}, {  6554, 4194,  6554, 4194},
+       { 10082, 6554, 10082, 6554}, {  6554, 4194,  6554, 4194}  },
+    {  {  9362, 5825,  9362, 5825}, {  5825, 3647,  5825, 3647},
+       {  9362, 5825,  9362, 5825}, {  5825, 3647,  5825, 3647}  },
+    {  {  8192, 5243,  8192, 5243}, {  5243, 3355,  5243, 3355},
+       {  8192, 5243,  8192, 5243}, {  5243, 3355,  5243, 3355}  },
+    {  {  7282, 4559,  7282, 4559}, {  4559, 2893,  4559, 2893},
+       {  7282, 4559,  7282, 4559}, {  4559, 2893,  4559, 2893}  }
+};
+
+static const int dequant_mf[6][4][4] = /* same [6][4][4] layout as quant_mf; presumably the matching dequantisation factors (user not visible here) - TODO confirm */
+{
+    { {10, 13, 10, 13}, {13, 16, 13, 16}, {10, 13, 10, 13}, {13, 16, 13, 16} },
+    { {11, 14, 11, 14}, {14, 18, 14, 18}, {11, 14, 11, 14}, {14, 18, 14, 18} },
+    { {13, 16, 13, 16}, {16, 20, 16, 20}, {13, 16, 13, 16}, {16, 20, 16, 20} },
+    { {14, 18, 14, 18}, {18, 23, 18, 23}, {14, 18, 14, 18}, {18, 23, 18, 23} },
+    { {16, 20, 16, 20}, {20, 25, 20, 25}, {16, 20, 16, 20}, {20, 25, 20, 25} },
+    { {18, 23, 18, 23}, {23, 29, 23, 29}, {18, 23, 18, 23}, {23, 29, 23, 29} }
+};
+
+
+static int predict_pred_intra4x4_mode( x264_t *h, x264_macroblock_t *mb, int idx ) /* predicted intra 4x4 mode for block idx: the smaller of the two neighbours' modes; h is unused */
+{
+    x264_macroblock_t *mba = mb->context->block[idx].mba; /* macroblocks holding neighbours a and b of block idx (presumably left and top - confirm) */
+    x264_macroblock_t *mbb = mb->context->block[idx].mbb;
+
+    int i_mode_a = I_PRED_4x4_DC; /* DC stands in for a neighbour that is not coded as I_4x4 */
+    int i_mode_b = I_PRED_4x4_DC;
+
+    if( !mba || !mbb )
+    {
+        return I_PRED_4x4_DC; /* either neighbour missing: always DC */
+    }
+
+    if( mba->i_type == I_4x4 )
+    {
+        i_mode_a = mb->context->block[idx].bka->i_intra4x4_pred_mode;
+    }
+    if( mbb->i_type == I_4x4 )
+    {
+        i_mode_b = mb->context->block[idx].bkb->i_intra4x4_pred_mode;
+    }
+
+    return X264_MIN( i_mode_a, i_mode_b );
+}
+
+static int predict_non_zero_code( x264_t *h, x264_macroblock_t *mb, int idx ) /* predicted non-zero coefficient count for block idx, from neighbours a and b; h is unused */
+{
+    x264_macroblock_t *mba = mb->context->block[idx].mba;
+    x264_macroblock_t *mbb = mb->context->block[idx].mbb;
+
+    int i_z_a = 0x80, i_z_b = 0x80; /* 0x80 marks "neighbour not available" */
+    int i_ret;
+
+    /* none avail -> 0, one avail -> this one, both -> (a+b+1)>>1 */
+    if( mba )
+    {
+        i_z_a = mb->context->block[idx].bka->i_non_zero_count;
+    }
+    if( mbb )
+    {
+        i_z_b = mb->context->block[idx].bkb->i_non_zero_count;
+    }
+
+    i_ret = i_z_a+i_z_b;
+    if( i_ret < 0x80 ) /* no 0x80 marker in the sum: both neighbours available, take the rounded average */
+    {
+        i_ret = ( i_ret + 1 ) >> 1;
+    }
+    return i_ret & 0x7f; /* strips the marker(s): 0x80+x -> x, 0x100 -> 0 */
+}
+
+
+/*
+ * Handle intra mb
+ */
+/* Store in mode[] the 16x16 intra prediction modes usable with the neighbours flagged in mb->i_neighbour, and their number in *pi_count (at most 4 entries). */
+static void predict_16x16_mode_available( x264_macroblock_t *mb, int *mode, int *pi_count )
+{
+    if( ( mb->i_neighbour & (MB_LEFT|MB_TOP) ) == (MB_LEFT|MB_TOP) )
+    {
+        /* top and left available */
+        *mode++ = I_PRED_16x16_DC;
+        *mode++ = I_PRED_16x16_V;
+        *mode++ = I_PRED_16x16_H;
+        *mode++ = I_PRED_16x16_P;
+        *pi_count = 4;
+    }
+    else if( ( mb->i_neighbour & MB_LEFT ) )
+    {
+        /* left available */
+        *mode++ = I_PRED_16x16_DC_LEFT;
+        *mode++ = I_PRED_16x16_H;
+        *pi_count = 2;
+    }
+    else if( ( mb->i_neighbour & MB_TOP ) )
+    {
+        /* top available */
+        *mode++ = I_PRED_16x16_DC_TOP;
+        *mode++ = I_PRED_16x16_V;
+        *pi_count = 2;
+    }
+    else
+    {
+        /* none available */
+        *mode = I_PRED_16x16_DC_128;
+        *pi_count = 1;
+    }
+}
+
+/* Store in mode[] the chroma (I_PRED_CHROMA_*) prediction modes usable with the neighbours flagged in mb->i_neighbour, and their number in *pi_count (at most 4 entries). */
+static void predict_8x8_mode_available( x264_macroblock_t *mb, int *mode, int *pi_count )
+{
+    if( ( mb->i_neighbour & (MB_LEFT|MB_TOP) ) == (MB_LEFT|MB_TOP) )
+    {
+        /* top and left available */
+        *mode++ = I_PRED_CHROMA_DC;
+        *mode++ = I_PRED_CHROMA_V;
+        *mode++ = I_PRED_CHROMA_H;
+        *mode++ = I_PRED_CHROMA_P;
+        *pi_count = 4;
+    }
+    else if( ( mb->i_neighbour & MB_LEFT ) )
+    {
+        /* left available */
+        *mode++ = I_PRED_CHROMA_DC_LEFT;
+        *mode++ = I_PRED_CHROMA_H;
+        *pi_count = 2;
+    }
+    else if( ( mb->i_neighbour & MB_TOP ) )
+    {
+        /* top available */
+        *mode++ = I_PRED_CHROMA_DC_TOP;
+        *mode++ = I_PRED_CHROMA_V;
+        *pi_count = 2;
+    }
+    else
+    {
+        /* none available */
+        *mode = I_PRED_CHROMA_DC_128;
+        *pi_count = 1;
+    }
+}
+
+/* Store in mode[] the 4x4 intra prediction modes usable for block idx and their number in *pi_count. MAX = 9: 7 modes when left and top are usable, plus DDL and VL when b_c holds, so mode[] needs room for 9 entries. */
+static void predict_4x4_mode_available( x264_macroblock_t *mb, int idx, int *mode, int *pi_count )
+{
+    int b_a, b_b, b_c;
+    static const int needmb[16] = /* neighbour flags that block idx depends on */
+    {
+        MB_LEFT|MB_TOP, MB_TOP,
+        MB_LEFT,        MB_PRIVATE,
+        MB_TOP,         MB_TOP|MB_TOPRIGHT,
+        0,              MB_PRIVATE,
+        MB_LEFT,        0,
+        MB_LEFT,        MB_PRIVATE,
+        0,              MB_PRIVATE,
+        0,              MB_PRIVATE
+    };
+
+    /* FIXME even when b_c == 0 there is some case where missing pixels
+     * are emulated and thus more mode are available TODO
+     * analysis and encode should be fixed too */
+    b_a = (needmb[idx]&mb->i_neighbour&MB_LEFT) == (needmb[idx]&MB_LEFT); /* true when the block does not need MB_LEFT or mb->i_neighbour has it */
+    b_b = (needmb[idx]&mb->i_neighbour&MB_TOP) == (needmb[idx]&MB_TOP);   /* same test for MB_TOP */
+    b_c = (needmb[idx]&mb->i_neighbour&(MB_TOPRIGHT|MB_PRIVATE)) == (needmb[idx]&(MB_TOPRIGHT|MB_PRIVATE)); /* same test for MB_TOPRIGHT / MB_PRIVATE */
+
+    if( b_a && b_b )
+    {
+        *mode++ = I_PRED_4x4_DC;
+        *mode++ = I_PRED_4x4_H;
+        *mode++ = I_PRED_4x4_V;
+        *mode++ = I_PRED_4x4_DDR;
+        *mode++ = I_PRED_4x4_VR;
+        *mode++ = I_PRED_4x4_HD;
+        *mode++ = I_PRED_4x4_HU;
+
+        *pi_count = 7;
+
+        if( b_c )
+        {
+            *mode++ = I_PRED_4x4_DDL;
+            *mode++ = I_PRED_4x4_VL;
+            (*pi_count) += 2;
+        }
+    }
+    else if( b_a && !b_b )
+    {
+        *mode++ = I_PRED_4x4_DC_LEFT;
+        *mode++ = I_PRED_4x4_H;
+        *pi_count = 2;
+    }
+    else if( !b_a && b_b )
+    {
+        *mode++ = I_PRED_4x4_DC_TOP;
+        *mode++ = I_PRED_4x4_V;
+        *pi_count = 2;
+    }
+    else
+    {
+        *mode++ = I_PRED_4x4_DC_128;
+        *pi_count = 1;
+    }
+}
+
+/****************************************************************************
+ * Scan and Quant functions
+ ****************************************************************************/
+static const int scan_zigzag_x[16]={0, 1, 0, 0, 1, 2, 3, 2, 1, 0, 1, 2, 3, 3, 2, 3}; /* column of the i-th coefficient in scan order */
+static const int scan_zigzag_y[16]={0, 0, 1, 2, 1, 0, 0, 1, 2, 3, 3, 2, 1, 2, 3, 3}; /* row of the i-th coefficient in scan order */
+
+static inline void scan_zigzag_4x4full( int level[16], int16_t dct[4][4] ) /* copy all 16 coefficients of dct into level[] in scan_zigzag order */
+{
+    int i;
+
+    for( i = 0; i < 16; i++ )
+    {
+        level[i] = dct[scan_zigzag_y[i]][scan_zigzag_x[i]];
+    }
+}
+/* Zigzag scan of a 4x4 block that skips scan position 0: scan positions
+ * 1..15 are stored in level[0..14]. */
+static inline void scan_zigzag_4x4( int level[15], int16_t dct[4][4] )
+{
+    int n;
+
+    for( n = 0; n < 15; n++ )
+    {
+        const int row = scan_zigzag_y[n + 1];
+        const int col = scan_zigzag_x[n + 1];
+
+        level[n] = dct[row][col];
+    }
+}
+
+/* Copy a 2x2 DC block into level[0..3] in row-major order
+ * (dct[0][0], dct[0][1], dct[1][0], dct[1][1]). */
+static inline void scan_zigzag_2x2_dc( int level[4], int16_t dct[2][2] )
+{
+    int n;
+
+    for( n = 0; n < 4; n++ )
+    {
+        level[n] = dct[n >> 1][n & 1];
+    }
+}
+
+
+/* Quantize a 4x4 block of transform coefficients in place.
+ *
+ * i_qscale selects the multiplier set quant_mf[i_qscale % 6] and the right
+ * shift 15 + i_qscale / 6. The rounding offset is ( 1 << shift ) / 3 when
+ * b_intra is non-zero and ( 1 << shift ) / 6 otherwise. Non-positive
+ * coefficients are negated, quantized and negated back, so both signs are
+ * rounded the same way. */
+static void quant_4x4( int16_t dct[4][4], int i_qscale, int b_intra )
+{
+    const int i_shift = 15 + i_qscale / 6;
+    const int i_set   = i_qscale % 6;
+    const int i_round = ( 1 << i_shift ) / ( b_intra ? 3 : 6 );
+    int row, col;
+
+    for( row = 0; row < 4; row++ )
+    {
+        for( col = 0; col < 4; col++ )
+        {
+            const int64_t coef = (int64_t)dct[row][col];
+            const int64_t mf   = (int64_t)quant_mf[i_set][row][col];
+
+            dct[row][col] = coef > 0 ?    ( ( i_round + coef * mf ) >> i_shift )
+                                     : - ( ( i_round - coef * mf ) >> i_shift );
+        }
+    }
+}
+/* Quantize a 4x4 block of DC coefficients in place.
+ *
+ * Every coefficient uses the single multiplier quant_mf[i_qscale % 6][0][0];
+ * compared with quant_4x4 the rounding offset is doubled and the right shift
+ * is one bit larger. Non-positive values are negated, quantized and negated
+ * back. */
+static void quant_4x4_dc( int16_t dct[4][4], int i_qscale, int b_intra )
+{
+    const int i_qbits = 15 + i_qscale / 6;
+    const int i_set   = i_qscale % 6;
+    const int i_round = ( 1 << i_qbits ) / ( b_intra ? 3 : 6 );
+    int row, col;
+
+    for( row = 0; row < 4; row++ )
+    {
+        for( col = 0; col < 4; col++ )
+        {
+            const int64_t coef = (int64_t)dct[row][col];
+            const int64_t mf   = (int64_t)quant_mf[i_set][0][0];
+
+            if( coef > 0 )
+            {
+                dct[row][col] = ( 2*i_round + coef * mf ) >> ( 1 + i_qbits );
+            }
+            else
+            {
+                dct[row][col] = - ( ( 2*i_round - coef * mf ) >> ( 1 + i_qbits ) );
+            }
+        }
+    }
+}
+/* Quantize a 2x2 block of DC coefficients in place.
+ *
+ * Same arithmetic as quant_4x4_dc on a 2x2 array: one multiplier
+ * quant_mf[i_qscale % 6][0][0], doubled rounding offset, shift of
+ * 1 + 15 + i_qscale / 6. */
+static void quant_2x2_dc( int16_t dct[2][2], int i_qscale, int b_intra )
+{
+    const int i_qbits = 15 + i_qscale / 6;
+    const int i_set   = i_qscale % 6;
+    const int i_round = ( 1 << i_qbits ) / ( b_intra ? 3 : 6 );
+    int row, col;
+
+    for( row = 0; row < 2; row++ )
+    {
+        for( col = 0; col < 2; col++ )
+        {
+            /* XXX: is int64_t really needed ? */
+            const int64_t coef = (int64_t)dct[row][col];
+            const int64_t mf   = (int64_t)quant_mf[i_set][0][0];
+
+            if( coef > 0 )
+            {
+                dct[row][col] = ( 2*i_round + coef * mf ) >> ( 1 + i_qbits );
+            }
+            else
+            {
+                dct[row][col] = - ( ( 2*i_round - coef * mf ) >> ( 1 + i_qbits ) );
+            }
+        }
+    }
+}
+
+/* Rescale a 4x4 block of DC coefficients in place.
+ *
+ * Every coefficient is multiplied by dequant_mf[i_qscale % 6][0][0]. With
+ * i_qbits = i_qscale / 6 the product is then
+ *   - multiplied by 2^(i_qbits - 2)                      when i_qbits >= 2,
+ *   - rounded and shifted right by ( 2 - i_qbits ) bits  otherwise.
+ *
+ * The up-scaling is written as a multiplication rather than a left shift:
+ * coefficients can be negative and left-shifting a negative value is
+ * undefined behaviour in C. */
+static void dequant_4x4_dc( int16_t dct[4][4], int i_qscale )
+{
+    const int i_mf    = i_qscale%6;
+    const int i_qbits = i_qscale/6;
+    const int i_dmf   = dequant_mf[i_mf][0][0];
+    int x,y;
+
+    if( i_qbits >= 2 )
+    {
+        const int i_scale = 1 << ( i_qbits - 2 );
+
+        for( y = 0; y < 4; y++ )
+        {
+            for( x = 0; x < 4; x++ )
+            {
+                dct[y][x] = ( dct[y][x] * i_dmf ) * i_scale;
+            }
+        }
+    }
+    else
+    {
+        /* rounding offset: half of the divisor implied by the right shift */
+        const int f       = 1 << ( 1 - i_qbits );
+        const int i_shift = 2 - i_qbits;
+
+        for( y = 0; y < 4; y++ )
+        {
+            for( x = 0; x < 4; x++ )
+            {
+                dct[y][x] = ( dct[y][x] * i_dmf + f ) >> i_shift;
+            }
+        }
+    }
+}
+
+/* Rescale a 2x2 block of DC coefficients in place.
+ *
+ * Every coefficient is multiplied by dequant_mf[i_qscale % 6][0][0]; with
+ * i_qbits = i_qscale / 6 the product is multiplied by 2^(i_qbits - 1) when
+ * i_qbits >= 1 and shifted right by one bit (no rounding offset) otherwise.
+ *
+ * The up-scaling is written as a multiplication rather than a left shift:
+ * coefficients can be negative and left-shifting a negative value is
+ * undefined behaviour in C. */
+static void dequant_2x2_dc( int16_t dct[2][2], int i_qscale )
+{
+    const int i_mf    = i_qscale%6;
+    const int i_qbits = i_qscale/6;
+    const int i_dmf   = dequant_mf[i_mf][0][0];
+    int x,y;
+
+    if( i_qbits >= 1 )
+    {
+        const int i_scale = 1 << ( i_qbits - 1 );
+
+        for( y = 0; y < 2; y++ )
+        {
+            for( x = 0; x < 2; x++ )
+            {
+                dct[y][x] = ( dct[y][x] * i_dmf ) * i_scale;
+            }
+        }
+    }
+    else
+    {
+        for( y = 0; y < 2; y++ )
+        {
+            for( x = 0; x < 2; x++ )
+            {
+                dct[y][x] = ( dct[y][x] * i_dmf ) >> 1;
+            }
+        }
+    }
+}
+/* Rescale a 4x4 block of quantized coefficients in place: each coefficient
+ * is multiplied by its dequant_mf[i_qscale % 6] entry and by
+ * 2^(i_qscale / 6).
+ *
+ * The power-of-two scaling is a multiplication rather than a left shift:
+ * coefficients can be negative and left-shifting a negative value is
+ * undefined behaviour in C.
+ *
+ * NOTE(review): the table is indexed [x][y] here while quant_4x4 indexes
+ * quant_mf as [y][x]; this is harmless only if dequant_mf is symmetric --
+ * confirm against the table definition. */
+static void dequant_4x4( int16_t dct[4][4], int i_qscale )
+{
+    const int i_mf    = i_qscale%6;
+    const int i_qbits = i_qscale/6;
+    const int i_scale = 1 << i_qbits;
+    int x,y;
+
+    for( y = 0; y < 4; y++ )
+    {
+        for( x = 0; x < 4; x++ )
+        {
+            dct[y][x] = ( dct[y][x] * dequant_mf[i_mf][x][y] ) * i_scale;
+        }
+    }
+}
+
+/* Return how many of the first i_count entries of v are non-zero. */
+static inline int array_non_zero_count( int *v, int i_count )
+{
+    int i_total = 0;
+    int n;
+
+    for( n = 0; n < i_count; n++ )
+    {
+        i_total += ( v[n] != 0 );
+    }
+    return i_total;
+}
+
+/* Number of partitions for a macroblock partition type: 1 for D_16x16,
+ * 2 for D_16x8 / D_8x16, 4 for D_8x8 and 0 for any other value.
+ * TODO : use a table instead */
+static int mb_partition_count( int i_partition )
+{
+    if( i_partition == D_16x16 )
+    {
+        return 1;
+    }
+    if( i_partition == D_16x8 || i_partition == D_8x16 )
+    {
+        return 2;
+    }
+    if( i_partition == D_8x8 )
+    {
+        return 4;
+    }
+
+    /* should never occur */
+    return 0;
+}
+
+/* Number of sub-partitions for a sub-partition type: 4 for the 4x4 types,
+ * 2 for the 4x8 and 8x4 types, 1 for the 8x8 types (including
+ * D_DIRECT_8x8) and 0 for any other value. */
+static int mb_sub_partition_count( int i_partition )
+{
+    if( i_partition == D_L0_4x4 ||
+        i_partition == D_L1_4x4 ||
+        i_partition == D_BI_4x4 )
+    {
+        return 4;
+    }
+
+    if( i_partition == D_L0_4x8 || i_partition == D_L1_4x8 || i_partition == D_BI_4x8 ||
+        i_partition == D_L0_8x4 || i_partition == D_L1_8x4 || i_partition == D_BI_8x4 )
+    {
+        return 2;
+    }
+
+    if( i_partition == D_L0_8x8 || i_partition == D_L1_8x8 ||
+        i_partition == D_BI_8x8 || i_partition == D_DIRECT_8x8 )
+    {
+        return 1;
+    }
+
+    /* should never occur */
+    return 0;
+}
+
+/* Store in *x / *y the position of the top-left 4x4 block (each in 0..3 for
+ * valid indices) of partition i_part, sub-partition i_sub of mb.
+ * i_sub is only used for D_8x8 macroblocks. For a partition type other than
+ * the four handled below the outputs are left untouched. */
+static inline void x264_macroblock_partition_getxy( x264_macroblock_t *mb, int i_part, int i_sub, int *x, int *y )
+{
+    switch( mb->i_partition )
+    {
+        case D_16x16:
+            *x = 0;
+            *y = 0;
+            break;
+
+        case D_16x8:
+            *x = 0;
+            *y = 2*i_part;
+            break;
+
+        case D_8x16:
+            *x = 2*i_part;
+            *y = 0;
+            break;
+
+        case D_8x8:
+        {
+            /* offset of the sub-partition inside its 8x8 quadrant */
+            int i_off_x = 0;
+            int i_off_y = 0;
+
+            if( IS_SUB4x4( mb->i_sub_partition[i_part] ) )
+            {
+                i_off_x = i_sub%2;
+                i_off_y = i_sub/2;
+            }
+            else if( IS_SUB4x8( mb->i_sub_partition[i_part] ) )
+            {
+                i_off_x = i_sub;
+            }
+            else if( IS_SUB8x4( mb->i_sub_partition[i_part] ) )
+            {
+                i_off_y = i_sub;
+            }
+
+            *x = 2 * (i_part%2) + i_off_x;
+            *y = 2 * (i_part/2) + i_off_y;
+            break;
+        }
+
+        default:
+            break;
+    }
+}
+/* Store in *w / *h the size, counted in 4x4 blocks, of partition i_part of
+ * mb (of its sub-partitions for a D_8x8 macroblock). i_sub is not used.
+ * For a partition type other than the four handled below the outputs are
+ * left untouched. */
+static inline void x264_macroblock_partition_size( x264_macroblock_t *mb, int i_part, int i_sub, int *w, int *h )
+{
+    switch( mb->i_partition )
+    {
+        case D_16x16:
+            *w = 4;
+            *h = 4;
+            break;
+
+        case D_16x8:
+            *w = 4;
+            *h = 2;
+            break;
+
+        case D_8x16:
+            *w = 2;
+            *h = 4;
+            break;
+
+        case D_8x8:
+            if( IS_SUB4x4( mb->i_sub_partition[i_part] ) )
+            {
+                *w = 1;
+                *h = 1;
+            }
+            else if( IS_SUB4x8( mb->i_sub_partition[i_part] ) )
+            {
+                *w = 1;
+                *h = 2;
+            }
+            else if( IS_SUB8x4( mb->i_sub_partition[i_part] ) )
+            {
+                *w = 2;
+                *h = 1;
+            }
+            else
+            {
+                /* whole 8x8 quadrant */
+                *w = 2;
+                *h = 2;
+            }
+            break;
+
+        default:
+            break;
+    }
+}
+
+/* Write reference index i_ref and motion vector (mx, my) for list i_list
+ * into every 4x4 cell of mb->partition covered by partition i_part /
+ * sub-partition i_sub. */
+void x264_macroblock_partition_set( x264_macroblock_t *mb, int i_list, int i_part, int i_sub, int i_ref, int mx, int my )
+{
+    int i_x0, i_y0;
+    int i_cols, i_rows;
+    int col, row;
+
+    x264_macroblock_partition_getxy( mb, i_part, i_sub, &i_x0, &i_y0 );
+    x264_macroblock_partition_size ( mb, i_part, i_sub, &i_cols, &i_rows );
+
+    for( row = i_y0; row < i_y0 + i_rows; row++ )
+    {
+        for( col = i_x0; col < i_x0 + i_cols; col++ )
+        {
+            mb->partition[col][row].i_ref[i_list] = i_ref;
+            mb->partition[col][row].mv[i_list][0] = mx;
+            mb->partition[col][row].mv[i_list][1] = my;
+        }
+    }
+}
+
+/* Read back the reference index and motion vector stored for list i_list in
+ * the top-left 4x4 cell of partition i_part / sub-partition i_sub.
+ * pi_ref may be NULL; the motion vector is only written when both pi_mx and
+ * pi_my are non-NULL. */
+void x264_macroblock_partition_get( x264_macroblock_t *mb, int i_list, int i_part, int i_sub, int *pi_ref, int *pi_mx, int *pi_my )
+{
+    int i_x0, i_y0;
+
+    x264_macroblock_partition_getxy( mb, i_part, i_sub, &i_x0, &i_y0 );
+
+    if( pi_ref )
+    {
+        *pi_ref = mb->partition[i_x0][i_y0].i_ref[i_list];
+    }
+
+    if( !pi_mx || !pi_my )
+    {
+        return;
+    }
+    *pi_mx = mb->partition[i_x0][i_y0].mv[i_list][0];
+    *pi_my = mb->partition[i_x0][i_y0].mv[i_list][1];
+}
+
+/* Compute the motion vector predictor of partition i_part / sub-partition
+ * i_subpart of mb for reference list i_list and store it in *mvxp / *mvyp.
+ *
+ * Three neighbouring 4x4 cells are examined: A (left), B (top) and C (top
+ * right, replaced by the top-left cell when the top-right one cannot be
+ * used). For each of them i_refX is
+ *   -1  when no macroblock is available at that position,
+ *   -2  when the macroblock exists but is intra coded,
+ *   otherwise the reference index stored in that cell (mvxX / mvyX then hold
+ *   its motion vector; they stay 0 in the two cases above).
+ *
+ * Not pretty, and not optimised for the common case. */
+void x264_macroblock_predict_mv( x264_macroblock_t *mb, int i_list, int i_part, int i_subpart, int *mvxp, int *mvyp )
+{
+    int x, y, xn, yn;
+    int w, h;
+    int i_ref;
+
+    int i_refa = -1;
+    int i_refb = -1;
+    int i_refc = -1;
+
+    int mvxa = 0, mvxb = 0, mvxc = 0;
+    int mvya = 0, mvyb = 0, mvyc = 0;
+
+    x264_macroblock_t *mbn;
+
+
+    /* position and size of the current partition, in 4x4 cells, and the
+     * reference index it uses */
+    x264_macroblock_partition_getxy( mb, i_part, i_subpart, &x, &y );
+    x264_macroblock_partition_size( mb, i_part, i_subpart, &w, &h );
+    i_ref = mb->partition[x][y].i_ref[i_list];
+
+    /* Left  pixel (-1,0)*/
+    xn = x - 1;
+    mbn = mb;
+    if( xn < 0 )
+    {
+        /* outside the current macroblock: wrap to the last column of the
+         * left macroblock (mb->mba, NULL when there is none) */
+        xn += 4;
+        mbn = mb->mba;
+    }
+    if( mbn )
+    {
+        i_refa = -2;
+        if( !IS_INTRA( mbn->i_type ) )
+        {
+            i_refa = mbn->partition[xn][y].i_ref[i_list];
+            mvxa   = mbn->partition[xn][y].mv[i_list][0];
+            mvya   = mbn->partition[xn][y].mv[i_list][1];
+        }
+    }
+
+    /* Up ( pixel(0,-1)*/
+    yn = y - 1;
+    mbn = mb;
+    if( yn < 0 )
+    {
+        /* wrap to the last row of the top macroblock (mb->mbb) */
+        yn += 4;
+        mbn = mb->mbb;
+    }
+    if( mbn )
+    {
+        i_refb = -2;
+        if( !IS_INTRA( mbn->i_type ) )
+        {
+            i_refb = mbn->partition[x][yn].i_ref[i_list];
+            mvxb   = mbn->partition[x][yn].mv[i_list][0];
+            mvyb   = mbn->partition[x][yn].mv[i_list][1];
+        }
+    }
+
+    /* Up right pixel(width,-1)*/
+    xn = x + w;
+    yn = y - 1;
+
+    mbn = mb;
+    if( yn < 0 && xn >= 4 )
+    {
+        /* the cell lies in the top-right macroblock (mb->mbc) */
+        if( mb->mbc )
+        {
+            xn -= 4;
+            yn += 4;
+            mbn = mb->mbc;
+        }
+        else
+        {
+            mbn = NULL;
+        }
+    }
+    else if( yn < 0 )
+    {
+        /* the cell lies in the top macroblock */
+        yn += 4;
+        mbn = mb->mbb;
+    }
+    else if( xn >= 4 || ( xn == 2 && ( yn == 0 || yn == 2 ) ) )
+    {
+        mbn = NULL; /* not yet decoded */
+    }
+
+    if( mbn == NULL )
+    {
+        /* load top left pixel(-1,-1) */
+        xn = x - 1;
+        yn = y - 1;
+
+        mbn = mb;
+        if( yn < 0 && xn < 0 )
+        {
+            if( mb->mba && mb->mbb )
+            {
+                xn += 4;
+                yn += 4;
+                /* NOTE(review): assumes the top-left macroblock is stored
+                 * immediately before the top one in memory -- confirm */
+                mbn = mb->mbb - 1;
+            }
+            else
+            {
+                mbn = NULL;
+            }
+        }
+        else if( yn < 0 )
+        {
+            yn += 4;
+            mbn = mb->mbb;
+        }
+        else if( xn < 0 )
+        {
+            xn += 4;
+            mbn = mb->mba;
+        }
+    }
+
+    if( mbn )
+    {
+        i_refc = -2;
+        if( !IS_INTRA( mbn->i_type ) )
+        {
+            i_refc = mbn->partition[xn][yn].i_ref[i_list];
+            mvxc   = mbn->partition[xn][yn].mv[i_list][0];
+            mvyc   = mbn->partition[xn][yn].mv[i_list][1];
+        }
+    }
+
+    /* Directional shortcuts for 16x8 / 8x16 partitions: one specific
+     * neighbour is used directly when it has the same reference index. */
+    if( mb->i_partition == D_16x8 && i_part == 0 && i_refb == i_ref )
+    {
+        *mvxp = mvxb;
+        *mvyp = mvyb;
+    }
+    else if( mb->i_partition == D_16x8 && i_part == 1 && i_refa == i_ref )
+    {
+        *mvxp = mvxa;
+        *mvyp = mvya;
+    }
+    else if( mb->i_partition == D_8x16 && i_part == 0 && i_refa == i_ref )
+    {
+        *mvxp = mvxa;
+        *mvyp = mvya;
+    }
+    else if( mb->i_partition == D_8x16 && i_part == 1 && i_refc == i_ref )
+    {
+        *mvxp = mvxc;
+        *mvyp = mvyc;
+    }
+    else
+    {
+        int i_count;
+
+        /* number of neighbours using the same reference as this partition */
+        i_count = 0;
+        if( i_refa == i_ref ) i_count++;
+        if( i_refb == i_ref ) i_count++;
+        if( i_refc == i_ref ) i_count++;
+
+        if( i_count > 1 )
+        {
+            /* several matches: component-wise median of the three vectors */
+            *mvxp = x264_median( mvxa, mvxb, mvxc );
+            *mvyp = x264_median( mvya, mvyb, mvyc );
+        }
+        else if( i_count == 1 )
+        {
+            /* exactly one match: copy that neighbour's vector */
+            if( i_refa == i_ref )
+            {
+                *mvxp = mvxa;
+                *mvyp = mvya;
+            }
+            else if( i_refb == i_ref )
+            {
+                *mvxp = mvxb;
+                *mvyp = mvyb;
+            }
+            else
+            {
+                *mvxp = mvxc;
+                *mvyp = mvyc;
+            }
+        }
+        else if( i_refb == -1 && i_refc == -1 && i_refa != -1 )
+        {
+            /* no match, and only the left neighbour exists: use it */
+            *mvxp = mvxa;
+            *mvyp = mvya;
+        }
+        else
+        {
+            *mvxp = x264_median( mvxa, mvxb, mvxc );
+            *mvyp = x264_median( mvya, mvyb, mvyc );
+        }
+    }
+}
+
+/* Motion vector predictor used for a skipped P macroblock (list 0).
+ *
+ * The result is (0,0) when the left or the top macroblock is missing, or
+ * when either of those neighbours uses reference 0 with a zero motion
+ * vector; otherwise it is whatever x264_macroblock_predict_mv() returns for
+ * list 0, partition 0, sub-partition 0. */
+void x264_macroblock_predict_mv_pskip( x264_macroblock_t *mb, int *mvxp, int *mvyp )
+{
+    x264_macroblock_t *p_nb;
+    int i_x0, i_y0;
+    int i_nx, i_ny;
+
+    /* -1: neighbour missing, -2: neighbour is intra, else its ref index */
+    int i_ref_left = -1;
+    int i_ref_top  = -1;
+
+    int i_mvx_left = 0, i_mvy_left = 0;
+    int i_mvx_top  = 0, i_mvy_top  = 0;
+
+    x264_macroblock_partition_getxy( mb, 0, 0, &i_x0, &i_y0 );
+
+    /* left neighbour, cell (-1,0) */
+    i_nx = i_x0 - 1;
+    p_nb = mb;
+    if( i_nx < 0 )
+    {
+        i_nx += 4;
+        p_nb = mb->mba;
+    }
+    if( p_nb )
+    {
+        i_ref_left = -2;
+        if( !IS_INTRA( p_nb->i_type ) )
+        {
+            i_ref_left = p_nb->partition[i_nx][i_y0].i_ref[0];
+            i_mvx_left = p_nb->partition[i_nx][i_y0].mv[0][0];
+            i_mvy_left = p_nb->partition[i_nx][i_y0].mv[0][1];
+        }
+    }
+
+    /* top neighbour, cell (0,-1) */
+    i_ny = i_y0 - 1;
+    p_nb = mb;
+    if( i_ny < 0 )
+    {
+        i_ny += 4;
+        p_nb = mb->mbb;
+    }
+    if( p_nb )
+    {
+        i_ref_top = -2;
+        if( !IS_INTRA( p_nb->i_type ) )
+        {
+            i_ref_top = p_nb->partition[i_x0][i_ny].i_ref[0];
+            i_mvx_top = p_nb->partition[i_x0][i_ny].mv[0][0];
+            i_mvy_top = p_nb->partition[i_x0][i_ny].mv[0][1];
+        }
+    }
+
+    if( i_ref_left != -1 && i_ref_top != -1 &&
+        !( i_ref_left == 0 && i_mvx_left == 0 && i_mvy_left == 0 ) &&
+        !( i_ref_top  == 0 && i_mvx_top  == 0 && i_mvy_top  == 0 ) )
+    {
+        x264_macroblock_predict_mv( mb, 0, 0, 0, mvxp, mvyp );
+        return;
+    }
+
+    *mvxp = 0;
+    *mvyp = 0;
+}
+
+/* 52-entry lookup table: identity for indices 0..29, then growing more
+ * slowly up to 39 at index 51.
+ * NOTE(review): presumably maps a luma QP (0..51) to the chroma QP, as in
+ * the H.264 chroma QP mapping table -- confirm against the callers. */
+static const int i_chroma_qp_table[52] =
+{
+     0,  1,  2,  3,  4,  5,  6,  7,  8,  9,
+    10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+    20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+    29, 30, 31, 32, 32, 33, 34, 34, 35, 35,
+    36, 36, 37, 37, 37, 38, 38, 38, 39, 39,
+    39, 39
+};
+
+/* Motion-compensate an inter macroblock from the list-0 reference frames
+ * into the frame being reconstructed (ctx->p_fdec).
+ *
+ * b_luma selects what is predicted: the luma plane when non-zero, both
+ * chroma planes otherwise. Only P_L0 and P_8x8 macroblocks are handled; any
+ * other type leaves fdec untouched. Partition positions and sizes are
+ * expressed in 4x4 luma blocks, hence the factor 4 for luma and 2 for
+ * chroma in the offsets and dimensions below. */
+static void x264_macroblock_mc( x264_t *h, x264_macroblock_t *mb, int b_luma )
+{
+    x264_mb_context_t *ctx = mb->context;
+
+    int ch;
+    int i_ref;
+    int mx, my;
+
+    if( mb->i_type == P_L0 )
+    {
+        int i_part;
+
+        /* one prediction per macroblock partition (no sub-partitions) */
+        for( i_part = 0; i_part < mb_partition_count( mb->i_partition ); i_part++ )
+        {
+            int i_width, i_height;
+            int x, y;
+
+            x264_macroblock_partition_get( mb, 0, i_part, 0, &i_ref, &mx, &my );
+            x264_macroblock_partition_getxy( mb, i_part, 0, &x, &y );
+            x264_macroblock_partition_size(  mb, i_part, 0, &i_width, &i_height );
+
+            if( b_luma )
+            {
+                int     i_src = ctx->i_fref0[i_ref][0];
+                uint8_t *p_src= ctx->p_fref0[i_ref][0];
+                int     i_dst = ctx->i_fdec[0];
+                uint8_t *p_dst= ctx->p_fdec[0];
+
+                h->mc[MC_LUMA]( &p_src[4*(x+y*i_src)], i_src,
+                                &p_dst[4*(x+y*i_dst)], i_dst,
+                                mx, my, 4*i_width, 4*i_height );
+            }
+            else
+            {
+                int     i_src,  i_dst;
+                uint8_t *p_src, *p_dst;
+
+                /* planes 1 and 2 are the two chroma planes */
+                for( ch = 0; ch < 2; ch++ )
+                {
+                    i_src = ctx->i_fref0[i_ref][1+ch];
+                    p_src = ctx->p_fref0[i_ref][1+ch];
+                    i_dst = ctx->i_fdec[1+ch];
+                    p_dst = ctx->p_fdec[1+ch];
+
+                    h->mc[MC_CHROMA]( &p_src[2*(x+y*i_src)], i_src,
+                                      &p_dst[2*(x+y*i_dst)], i_dst,
+                                      mx, my, 2*i_width, 2*i_height );
+                }
+            }
+        }
+    }
+    else if( mb->i_type == P_8x8 )
+    {
+        int i_part;
+
+        /* four 8x8 partitions, each split into its own sub-partitions */
+        for( i_part = 0; i_part < 4; i_part++ )
+        {
+            int i_sub;
+
+            for( i_sub = 0; i_sub < mb_sub_partition_count( mb->i_sub_partition[i_part] ); i_sub++ )
+            {
+                int i_width, i_height;
+                int x, y;
+
+                x264_macroblock_partition_get(   mb, 0, i_part, i_sub, &i_ref, &mx, &my );
+                x264_macroblock_partition_getxy( mb, i_part, i_sub, &x, &y );
+                x264_macroblock_partition_size(  mb, i_part, i_sub, &i_width, &i_height );
+
+                if( b_luma )
+                {
+                    int     i_src = ctx->i_fref0[i_ref][0];
+                    uint8_t *p_src= ctx->p_fref0[i_ref][0];
+                    int     i_dst = ctx->i_fdec[0];
+                    uint8_t *p_dst= ctx->p_fdec[0];
+
+                    h->mc[MC_LUMA]( &p_src[4*(x+y*i_src)], i_src,
+                                    &p_dst[4*(x+y*i_dst)], i_dst,
+                                    mx, my, 4*i_width, 4*i_height );
+                }
+                else
+                {
+                    int     i_src,  i_dst;
+                    uint8_t *p_src, *p_dst;
+
+                    for( ch = 0; ch < 2; ch++ )
+                    {
+                        i_src = ctx->i_fref0[i_ref][1+ch];
+                        p_src = ctx->p_fref0[i_ref][1+ch];
+                        i_dst = ctx->i_fdec[1+ch];
+                        p_dst = ctx->p_fdec[1+ch];
+
+                        h->mc[MC_CHROMA]( &p_src[2*(x+y*i_src)], i_src,
+                                          &p_dst[2*(x+y*i_dst)], i_dst,
+                                          mx, my, 2*i_width, 2*i_height );
+                    }
+                }
+            }
+        }
+    }
+}
+
+/*****************************************************************************
+ * x264_macroblock_context_load:
+ *  Fill *context for macroblock mb and attach it to mb->context:
+ *   - plane pointers and strides of the source picture (img), of the frame
+ *     being reconstructed (fdec) and of every list-0 / list-1 reference
+ *     frame, each positioned on this macroblock;
+ *   - for every luma block (0..15) and chroma block (16..23), the left and
+ *     top neighbouring macroblock and block, NULL when the neighbour is not
+ *     available according to mb->i_neighbour.
+ *****************************************************************************/
+void x264_macroblock_context_load( x264_t *h, x264_macroblock_t *mb, x264_mb_context_t *context )
+{
+    int i;
+    int x, y;
+    x264_macroblock_t *a = NULL;    /* left macroblock, if available */
+    x264_macroblock_t *b = NULL;    /* top macroblock, if available */
+
+    if( mb->i_neighbour&MB_LEFT )
+    {
+        a = mb - 1;
+    }
+    if( mb->i_neighbour&MB_TOP )
+    {
+        /* macroblocks are addressed as one array with i_mb_width per row */
+        b = mb - h->sps.i_mb_width;
+    }
+#define LOAD_PTR( dst, src ) \
+    context->p_##dst[0] = (src)->plane[0] + 16 * ( mb->i_mb_x + mb->i_mb_y * (src)->i_stride[0] ); \
+    context->p_##dst[1] = (src)->plane[1] +  8 * ( mb->i_mb_x + mb->i_mb_y * (src)->i_stride[1] ); \
+    context->p_##dst[2] = (src)->plane[2] +  8 * ( mb->i_mb_x + mb->i_mb_y * (src)->i_stride[2] ); \
+    context->i_##dst[0] = (src)->i_stride[0]; \
+    context->i_##dst[1] = (src)->i_stride[1]; \
+    context->i_##dst[2] = (src)->i_stride[2]
+
+    /* plane 0 advances 16 pixels per macroblock, planes 1 and 2 advance 8 */
+    LOAD_PTR( img,  h->picture );
+    LOAD_PTR( fdec, h->fdec );
+    for( i = 0; i < h->i_ref0; i++ )
+    {
+        LOAD_PTR( fref0[i], h->fref0[i] );
+    }
+    for( i = 0; i < h->i_ref1; i++ )
+    {
+        LOAD_PTR( fref1[i], h->fref1[i] );
+    }
+#undef LOAD_PTR
+
+    for( y = 0; y < 4; y++ )
+    {
+        for( x = 0; x < 4; x++ )
+        {
+            int idx;
+            int xa, yb;
+            x264_macroblock_t *mba;
+            x264_macroblock_t *mbb;
+
+            idx = block_idx_xy[x][y];
+            mba = mb;
+            mbb = mb;
+
+            /* left: wraps to the last column of the left macroblock */
+            xa = x - 1;
+            if (xa < 0 )
+            {
+                xa += 4;
+                mba = a;
+            }
+            /* up */
+            yb = y - 1;
+            if (yb < 0 )
+            {
+                yb += 4;
+                mbb = b;
+            }
+
+            context->block[idx].mba = mba;
+            context->block[idx].mbb = mbb;
+            context->block[idx].bka = mba ? &mba->block[block_idx_xy[xa][y]] : NULL;
+            context->block[idx].bkb = mbb ? &mbb->block[block_idx_xy[x][yb]] : NULL;
+
+            /* the top-left 2x2 positions are reused to fill the chroma
+             * blocks: 4 per plane, stored at block[16 + 4*ch + idx] */
+            if( x < 2 && y < 2 )
+            {
+                int ch;
+                if( xa > 1 ) xa -= 2;   /* we have wrap but here step is 2 not 4 */
+                if( yb > 1 ) yb -= 2;   /* idem */
+
+                for( ch = 0; ch < 2; ch++ )
+                {
+                    context->block[16+4*ch+idx].mba = mba;
+                    context->block[16+4*ch+idx].mbb = mbb;
+                    context->block[16+4*ch+idx].bka = mba ? &mba->block[16+4*ch+block_idx_xy[xa][y]] : NULL;
+                    context->block[16+4*ch+idx].bkb = mbb ? &mbb->block[16+4*ch+block_idx_xy[x][yb]] : NULL;
+                }
+            }
+        }
+    }
+
+    mb->context = context;
+}
+
+/* (ref: JVT-B118)
+ * x264_mb_decimate_score: rate how cheaply the i_max scanned coefficients in
+ * dct could be dropped (a low score means the block can be set to zero).
+ *
+ * Returns 9 as soon as a coefficient with magnitude above 1 is met.
+ * Otherwise every non-zero coefficient, walking from the last one towards
+ * index 0, adds a cost that depends on the number of zeros preceding it:
+ * 3 for none, 2 for one or two, 1 for three to five, 0 beyond.
+ *
+ * Used in inter macroblock (luma and chroma)
+ *  luma: for a 8x8 block: if score < 4 -> null
+ *        for the complete mb: if score < 6 -> null
+ *  chroma: for the complete mb: if score < 7 -> null
+ */
+static int x264_mb_decimate_score( int *dct, int i_max )
+{
+    static const int i_run_cost[16] = { 3, 2, 2, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+
+    int i_total = 0;
+    int pos;
+
+    /* skip the trailing zeros */
+    for( pos = i_max - 1; pos >= 0 && dct[pos] == 0; pos-- )
+    {
+    }
+
+    while( pos >= 0 )
+    {
+        int i_zeros = 0;
+
+        /* dct[pos] is non-zero here */
+        if( abs( dct[pos] ) > 1 )
+        {
+            return 9;
+        }
+
+        /* count the zeros in front of it */
+        for( pos--; pos >= 0 && dct[pos] == 0; pos-- )
+        {
+            i_zeros++;
+        }
+        i_total += i_run_cost[i_zeros];
+    }
+
+    return i_total;
+}
+
+/* Encode luma 4x4 block idx of mb at quantizer i_qscale.
+ *
+ * The difference between the source picture and the current content of
+ * fdec at that block is transformed and quantized (intra rounding), the
+ * zigzag-scanned levels are stored in mb->block[idx].luma4x4, and the
+ * dequantized, inverse-transformed residual is added back into fdec. */
+static void x264_mb_encode_4x4( x264_t *h, x264_macroblock_t *mb, int idx, int i_qscale )
+{
+    x264_mb_context_t *ctx = mb->context;
+
+    const int i_stride_src = ctx->i_img[0];
+    const int i_stride_dst = ctx->i_fdec[0];
+
+    /* pixel offset of the block inside the macroblock */
+    const int i_px = 4 * block_idx_x[idx];
+    const int i_py = 4 * block_idx_y[idx];
+
+    uint8_t *p_orig = &ctx->p_img[0][i_px + i_py * i_stride_src];
+    uint8_t *p_rec  = &ctx->p_fdec[0][i_px + i_py * i_stride_dst];
+
+    int16_t residual[4][4];
+    int16_t coef[4][4];
+
+    /* residual = source - fdec */
+    h->pixf.sub4x4( residual, p_orig, i_stride_src, p_rec, i_stride_dst );
+
+    /* forward transform, quantization, scan */
+    h->dctf.dct4x4( coef, residual );
+    quant_4x4( coef, i_qscale, 1 );
+    scan_zigzag_4x4full( mb->block[idx].luma4x4, coef );
+
+    /* reconstruction: dequantize, inverse transform, add to fdec */
+    dequant_4x4( coef, i_qscale );
+    h->dctf.idct4x4( residual, coef );
+    h->pixf.add4x4( p_rec, i_stride_dst, residual );
+}
+
+/* Encode the luma of an intra 16x16 macroblock at quantizer i_qscale.
+ *
+ * The residual (source minus current fdec content) is transformed as
+ * sixteen 4x4 blocks. Their DC coefficients are gathered in dct4x4[0],
+ * transformed and quantized separately and stored in mb->luma16x16_dc; the
+ * 15 AC levels of block i go to mb->block[i].residual_ac. The macroblock is
+ * then reconstructed into fdec from the quantized data. */
+static void x264_mb_encode_i16x16( x264_t *h, x264_macroblock_t *mb, int i_qscale )
+{
+    x264_mb_context_t *ctx = mb->context;
+
+    uint8_t *p_src = ctx->p_img[0];
+    int      i_src = ctx->i_img[0];
+    uint8_t *p_dst = ctx->p_fdec[0];
+    int      i_dst = ctx->i_fdec[0];
+
+    int16_t luma[16][4][4];
+    int16_t dct4x4[16+1][4][4];     /* [0]: DC block, [1..16]: the 4x4 blocks */
+
+    int i;
+
+    /* calculate the diff */
+    h->pixf.sub16x16( luma, p_src, i_src, p_dst, i_dst );
+
+    /* calculate dct coeffs */
+    for( i = 0; i < 16; i++ )
+    {
+        h->dctf.dct4x4( dct4x4[i+1], luma[i] );
+
+        /* copy dc coeff (unquantized) into the DC block */
+        dct4x4[0][block_idx_y[i]][block_idx_x[i]] = dct4x4[1+i][0][0];
+
+        quant_4x4( dct4x4[1+i], i_qscale, 1 );
+        scan_zigzag_4x4( mb->block[i].residual_ac, dct4x4[1+i] );
+    }
+
+    /* transform, quantize and scan the DC block */
+    h->dctf.dct4x4dc( dct4x4[0], dct4x4[0] );
+    quant_4x4_dc( dct4x4[0], i_qscale, 1 );
+    scan_zigzag_4x4full( mb->luma16x16_dc, dct4x4[0] );
+
+    /* output samples to fdec */
+    h->dctf.idct4x4dc( dct4x4[0], dct4x4[0] );
+    dequant_4x4_dc( dct4x4[0], i_qscale );  /* XXX not inversed: the inverse transform is applied before dequantization */
+
+    /* dequantize each block and inverse transform it */
+    for( i = 0; i < 16; i++ )
+    {
+        dequant_4x4( dct4x4[1+i], i_qscale );
+
+        /* copy dc coeff: replace it by the reconstructed DC value */
+        dct4x4[1+i][0][0] = dct4x4[0][block_idx_y[i]][block_idx_x[i]];
+
+        h->dctf.idct4x4( luma[i], dct4x4[i+1] );
+    }
+    /* put pixels to fdec */
+    h->pixf.add16x16( p_dst, i_dst, luma );
+}
+
+/* Encode both 8x8 chroma planes of mb at quantizer i_qscale.
+ *
+ * For each plane the residual (source minus current fdec content) is
+ * transformed as four 4x4 blocks. The four DC coefficients go through a 2x2
+ * transform and are stored in mb->chroma_dc[ch]; the AC levels of block i
+ * are stored in mb->block[16+i+ch*4].residual_ac. The plane is then
+ * reconstructed into fdec.
+ *
+ * When b_inter is non-zero and the decimation score of the plane's AC
+ * coefficients is below 7, the AC coefficients are dropped to save bits.
+ * The score is reset for every plane, so the decision for the second plane
+ * does not include the coefficients of the first one. */
+static void x264_mb_encode_8x8( x264_t *h, x264_macroblock_t *mb, int b_inter, int i_qscale )
+{
+    x264_mb_context_t *ctx = mb->context;
+
+    uint8_t *p_src, *p_dst;
+    int      i_src, i_dst;
+
+    int i, ch;
+
+    for( ch = 0; ch < 2; ch++ )
+    {
+        int16_t chroma[4][4][4];
+        int16_t dct2x2[2][2];
+        int16_t dct4x4[4][4][4];
+        int i_decimate_score = 0;   /* per plane */
+
+        p_src = ctx->p_img[1+ch];
+        i_src = ctx->i_img[1+ch];
+        p_dst = ctx->p_fdec[1+ch];
+        i_dst = ctx->i_fdec[1+ch];
+
+        /* calculate the diff */
+        h->pixf.sub8x8( chroma, p_src, i_src, p_dst, i_dst );
+
+        /* calculate dct coeffs */
+        for( i = 0; i < 4; i++ )
+        {
+            h->dctf.dct4x4( dct4x4[i], chroma[i] );
+
+            /* copy dc coeff (unquantized) into the 2x2 DC block */
+            dct2x2[block_idx_y[i]][block_idx_x[i]] = dct4x4[i][0][0];
+
+            quant_4x4( dct4x4[i], i_qscale, 1 );
+            scan_zigzag_4x4( mb->block[16+i+ch*4].residual_ac, dct4x4[i] );
+
+            i_decimate_score += x264_mb_decimate_score( mb->block[16+i+ch*4].residual_ac, 15 );
+        }
+
+        h->dctf.dct2x2dc( dct2x2, dct2x2 );
+        quant_2x2_dc( dct2x2, i_qscale, 1 );
+        scan_zigzag_2x2_dc( mb->chroma_dc[ch], dct2x2 );
+
+        if( i_decimate_score < 7 && b_inter )
+        {
+            /* Near null chroma 8x8 block so make it null (bits saving) */
+            for( i = 0; i < 4; i++ )
+            {
+                int x, y;
+                for( x = 0; x < 15; x++ )
+                {
+                    mb->block[16+i+ch*4].residual_ac[x] = 0;
+                }
+                /* the DC slot cleared here is rewritten from dct2x2 below */
+                for( x = 0; x < 4; x++ )
+                {
+                    for( y = 0; y < 4; y++ )
+                    {
+                        dct4x4[i][x][y] = 0;
+                    }
+                }
+            }
+        }
+
+        /* output samples to fdec */
+        h->dctf.idct2x2dc( dct2x2, dct2x2 );
+        dequant_2x2_dc( dct2x2, i_qscale );  /* XXX not inversed */
+
+        /* dequantize each block and inverse transform it */
+        for( i = 0; i < 4; i++ )
+        {
+            dequant_4x4( dct4x4[i], i_qscale );
+
+            /* copy dc coeff: replace it by the reconstructed DC value */
+            dct4x4[i][0][0] = dct2x2[block_idx_y[i]][block_idx_x[i]];
+
+            h->dctf.idct4x4( chroma[i], dct4x4[i] );
+        }
+        h->pixf.add8x8( p_dst, i_dst, chroma );
+    }
+}
+
+/* Map the DC variants of the 4x4 intra prediction modes (DC_LEFT, DC_TOP,
+ * DC_128) to I_PRED_4x4_DC; any other mode is returned unchanged. */
+static int x264_mb_pred_mode4x4_fix( int i_mode )
+{
+    const int b_dc_variant = i_mode == I_PRED_4x4_DC_LEFT ||
+                             i_mode == I_PRED_4x4_DC_TOP  ||
+                             i_mode == I_PRED_4x4_DC_128;
+
+    return b_dc_variant ? I_PRED_4x4_DC : i_mode;
+}
+/* Map the DC variants of the 16x16 intra prediction modes (DC_LEFT, DC_TOP,
+ * DC_128) to I_PRED_16x16_DC; any other mode is returned unchanged. */
+static int x264_mb_pred_mode16x16_fix( int i_mode )
+{
+    const int b_dc_variant = i_mode == I_PRED_16x16_DC_LEFT ||
+                             i_mode == I_PRED_16x16_DC_TOP  ||
+                             i_mode == I_PRED_16x16_DC_128;
+
+    return b_dc_variant ? I_PRED_16x16_DC : i_mode;
+}
+/* Map the DC variants of the chroma intra prediction modes (DC_LEFT,
+ * DC_TOP, DC_128) to I_PRED_CHROMA_DC; any other mode is returned
+ * unchanged. */
+static int x264_mb_pred_mode8x8_fix( int i_mode )
+{
+    const int b_dc_variant = i_mode == I_PRED_CHROMA_DC_LEFT ||
+                             i_mode == I_PRED_CHROMA_DC_TOP  ||
+                             i_mode == I_PRED_CHROMA_DC_128;
+
+    return b_dc_variant ? I_PRED_CHROMA_DC : i_mode;
+}
+
+/* Scratch results of the per-macroblock mode analysis: the parameters the
+ * analysis runs with, followed by the best cost and settings found for each
+ * candidate coding mode. */
+typedef struct
+{
+    /* conduct the analysis using this lambda and QP */
+    int i_lambda;
+    int i_qp;
+
+    /* Edge histogram (only luma), filled by x264_macroblock_analyse_edge:
+     * accumulated gradient strength per prediction direction */
+    int i_edge_4x4[4][4][9];    /* [block y][block x][mode]; mode 2 isn't calculated (DC) */
+    int i_edge_16x16[4];        /* [mode]; mode 2 isn't calculated (DC) */
+
+    /* I: Intra part */
+    /* Luma part 16x16 and 4x4 modes stats */
+    int i_sad_i16x16;           /* best cost found, -1 while none was evaluated */
+    int i_predict16x16;         /* prediction mode giving that cost */
+
+    int i_sad_i4x4;
+    int i_predict4x4[4][4];
+
+    /* Chroma part */
+    int i_sad_i8x8;
+    int i_predict8x8;
+
+    /* II: Inter part */
+    int i_sad_p16x16;
+    int i_ref_p16x16;
+    int i_mv_p16x16[2];
+
+    int i_sad_p16x8;
+    int i_ref_p16x8;
+    int i_mv_p16x8[2][2];
+
+    int i_sad_p8x16;
+    int i_ref_p8x16;
+    int i_mv_p8x16[2][2];
+
+    int i_sad_p8x8;
+    int i_ref_p8x8;
+    int i_sub_partition_p8x8[4];
+    int i_mv_p8x8[4][4][2];
+
+} x264_mb_analysis_t;
+
+
+/* 52-entry cost table, non-decreasing from 1 to 91.
+ * NOTE(review): presumably indexed by QP (0..51) to derive the lambda used
+ * by the mode analysis -- confirm against the code that reads it. */
+static const int i_qp0_cost_table[52] =
+{
+   1, 1, 1, 1, 1, 1, 1, 1,
+   1, 1, 1, 1,
+   1, 1, 1, 1, 2, 2, 2, 2,
+   3, 3, 3, 4, 4, 4, 5, 6,
+   6, 7, 8, 9,10,11,13,14,
+  16,18,20,23,25,29,32,36,
+  40,45,51,57,64,72,81,91
+};
+
+
+/* Build edge-direction histograms of the source luma macroblock.
+ *
+ * For every pixel (the outermost row/column of the macroblock excepted, so
+ * that the 3x3 neighbourhood stays inside it) two gradients dgx and dgy are
+ * computed from the 3x3 neighbourhood. Their ratio selects a prediction
+ * direction and |dgx| + |dgy| is accumulated for that direction in
+ * res->i_edge_4x4[y][x][mode] (per 4x4 block) and res->i_edge_16x16[mode]
+ * (whole macroblock).
+ *
+ * h is not used. */
+static void x264_macroblock_analyse_edge( x264_t *h, x264_macroblock_t *mb, x264_mb_analysis_t *res )
+{
+    uint8_t *p_img = mb->context->p_img[0];
+    int      i_img = mb->context->i_img[0];
+
+    int dx, dy;
+    int x,  y;
+    int i;
+
+/* 24.8 fixed point, truncated towards zero */
+#define FIX8( f ) ( (int)((f) * 256))
+    /* init stats (16x16) */
+    for( i = 0; i < 4; i++ )
+    {
+        res->i_edge_16x16[i] = 0;
+    }
+
+    for( y = 0; y < 4; y++ )
+    {
+        for( x = 0; x < 4; x++ )
+        {
+            /* init stats (4x4) */
+            for( i = 0; i < 9; i++ )
+            {
+                res->i_edge_4x4[y][x][i] = 0;
+            }
+
+            /* FIXME real interval 0-4 except for border mb */
+            for( dy = (y==0 ? 1:0); dy < (y==3?3:4); dy++ )
+            {
+                for( dx = (x==0?1:0); dx < (x==3?3:4); dx++ )
+                {
+                    /* block (x,y) starts at pixel (4*x, 4*y) */
+                    uint8_t *pix = &p_img[(y*4+dy)*i_img+(x*4+dx)];
+                    int dgx, dgy;
+                    int Ryx;
+                    int Ag;
+                    int Dg;
+
+
+                    /* row above minus row below */
+                    dgx = (pix[-1*i_img-1]+2*pix[-1*i_img+0]+pix[-1*i_img+1]) -
+                          (pix[ 1*i_img-1]+2*pix[ 1*i_img+0]+pix[ 1*i_img+1]);
+
+
+                    /* right column minus left column */
+                    dgy = (pix[-1*i_img+1]+2*pix[ 0*i_img+1]+pix[ 1*i_img+1]) -
+                          (pix[-1*i_img-1]+2*pix[ 0*i_img-1]+pix[ 1*i_img-1]);
+
+                    /* XXX angle to test/verify */
+                    Ag = abs( dgx ) + abs( dgy );
+
+                    if( dgx == 0 )
+                    {
+                        /* no division possible: use a ratio above every
+                         * threshold */
+                        Ryx = (4*256)<<8;
+                    }
+                    else
+                    {
+                        /* dgy can be negative: multiply instead of shifting */
+                        Ryx = ( dgy * 256 )/ dgx;
+                    }
+
+                    if( abs(Ryx) >= FIX8(5.027339) )
+                    {
+                        Dg = I_PRED_4x4_V;
+                    }
+                    else if( abs(Ryx) <= FIX8(0.198912) )
+                    {
+                        Dg = I_PRED_4x4_H;
+                    }
+                    else if( Ryx > FIX8(0.198912) && Ryx <= FIX8(0.668179) )
+                    {
+                        Dg = I_PRED_4x4_HD;
+                    }
+                    else if( Ryx > FIX8(0.668179) && Ryx <= FIX8(1.496606) )
+                    {
+                        Dg = I_PRED_4x4_DDR;
+                    }
+                    else if( Ryx > FIX8(1.496606) && Ryx <= FIX8(5.027339) )
+                    {
+                        Dg = I_PRED_4x4_VR;
+                    }
+                    else if( Ryx > FIX8(-5.027339) && Ryx <= FIX8(-1.496606) )
+                    {
+                        Dg = I_PRED_4x4_VL;
+                    }
+                    else if( Ryx > FIX8(-1.496606) && Ryx <= FIX8(-0.668179) )
+                    {
+                        Dg = I_PRED_4x4_DDL;
+                    }
+                    else if( Ryx > FIX8(-0.668179) && Ryx <= FIX8(-0.198912) )
+                    {
+                        Dg = I_PRED_4x4_HU;
+                    }
+                    else
+                    {
+                        /* Should never occur */
+                        fprintf( stderr, "mmh bad edge dectection function\n" );
+                        Dg = I_PRED_4x4_DC;
+                    }
+                    res->i_edge_4x4[y][x][Dg] += Ag;
+
+                    /* coarser classification for the 16x16 modes */
+                    if( abs(Ryx) > FIX8(2.414214) )
+                    {
+                        Dg = I_PRED_16x16_V;
+                    }
+                    else if( abs(Ryx) < FIX8(0.414214) )
+                    {
+                        Dg = I_PRED_16x16_H;
+                    }
+                    else
+                    {
+                        Dg = I_PRED_16x16_P;
+                    }
+                    res->i_edge_16x16[Dg] += Ag;
+                }
+            }
+        }
+    }
+#undef FIX8
+}
+
+static void x264_macroblock_analyse_i16x16( x264_t *h, x264_macroblock_t *mb, x264_mb_analysis_t *res )
+{
+    /* Try every available 16x16 luma intra predictor and keep the cheapest
+     * one in res->i_predict16x16 / res->i_sad_i16x16 (-1 if none was tried). */
+    uint8_t * const p_rec  = mb->context->p_fdec[0];
+    const int       i_rec  = mb->context->i_fdec[0];
+    uint8_t * const p_orig = mb->context->p_img[0];
+    const int       i_orig = mb->context->i_img[0];
+
+    int candidates[4];
+    int n_candidates;
+    int k;
+
+    /* Ask which predictors may be used for this macroblock */
+    predict_16x16_mode_available( mb, candidates, &n_candidates );
+
+    res->i_sad_i16x16 = -1;
+    for( k = 0; k < n_candidates; k++ )
+    {
+        const int i_mode = candidates[k];
+        int i_cost;
+
+        /* Build the prediction in place in the fdec luma plane */
+        h->predict_16x16[i_mode]( p_rec, i_rec );
+
+        /* Cost = satd against the source + lambda * bs_size_ue( fixed mode number ) */
+        i_cost  = h->pixf.satd[PIXEL_16x16]( p_rec, i_rec, p_orig, i_orig );
+        i_cost += res->i_lambda * bs_size_ue( x264_mb_pred_mode16x16_fix(i_mode) );
+
+        /* Only the first candidate or a strictly cheaper one replaces the best */
+        if( res->i_sad_i16x16 != -1 && res->i_sad_i16x16 <= i_cost )
+            continue;
+        res->i_predict16x16 = i_mode;
+        res->i_sad_i16x16   = i_cost;
+    }
+}
+
+static void x264_macroblock_analyse_i4x4( x264_t *h, x264_macroblock_t *mb, x264_mb_analysis_t *res )
+{
+    int i, idx;
+    /* Choose an intra 4x4 mode for each of the 16 luma blocks; the summed cost goes to res->i_sad_i4x4 */
+    int i_max;
+    int predict_mode[9];
+
+    uint8_t *p_dst = mb->context->p_fdec[0];
+    uint8_t *p_src = mb->context->p_img[0];
+    int      i_dst = mb->context->i_fdec[0];
+    int      i_src = mb->context->i_img[0];
+
+    res->i_sad_i4x4 = 0;
+
+    /* 4x4 prediction selection */
+    for( idx = 0; idx < 16; idx++ )
+    {
+        uint8_t *p_src_by;
+        uint8_t *p_dst_by;
+        int     i_best;
+        int x, y;
+        int i_pred_mode;
+        int i_th;
+
+        i_pred_mode= predict_pred_intra4x4_mode( h, mb, idx );
+        x = block_idx_x[idx];
+        y = block_idx_y[idx];
+        /* i_th: half of the largest edge score of this block, index 2 left out */
+        i_th = res->i_edge_4x4[y][x][0];
+        if( i_th < res->i_edge_4x4[y][x][1] ) i_th = res->i_edge_4x4[y][x][1];
+        if( i_th < res->i_edge_4x4[y][x][3] ) i_th = res->i_edge_4x4[y][x][3];
+        if( i_th < res->i_edge_4x4[y][x][4] ) i_th = res->i_edge_4x4[y][x][4];
+        if( i_th < res->i_edge_4x4[y][x][5] ) i_th = res->i_edge_4x4[y][x][5];
+        if( i_th < res->i_edge_4x4[y][x][6] ) i_th = res->i_edge_4x4[y][x][6];
+        if( i_th < res->i_edge_4x4[y][x][7] ) i_th = res->i_edge_4x4[y][x][7];
+        if( i_th < res->i_edge_4x4[y][x][8] ) i_th = res->i_edge_4x4[y][x][8];
+        i_th /= 2;
+        /* entry 2 (presumably the DC mode - confirm) gets the threshold itself, so it is never skipped below */
+        res->i_edge_4x4[y][x][2] = i_th;
+
+        p_src_by = p_src + 4 * x + 4 * y * i_src;
+        p_dst_by = p_dst + 4 * x + 4 * y * i_dst;
+
+        i_best = -1;
+        predict_4x4_mode_available( mb, idx, predict_mode, &i_max );
+        for( i = 0; i < i_max; i++ )
+        {
+            int i_sad;
+            int i_mode;
+            int i_fmode;
+
+            i_mode = predict_mode[i];
+            i_fmode = x264_mb_pred_mode4x4_fix( i_mode );
+            /* skip the modes whose edge score is below the threshold */
+            if( res->i_edge_4x4[y][x][i_fmode] < i_th )
+            {
+                continue;
+            }
+
+            /* we do the prediction */
+            h->predict_4x4[i_mode]( p_dst_by, i_dst );
+
+            /* distortion between the prediction and the source, computed by pixf.satd */
+            i_sad = h->pixf.satd[PIXEL_4x4]( p_dst_by, i_dst, p_src_by, i_src );
+            /* mode cost: 1 * lambda when it equals the predicted mode, 4 * lambda otherwise */
+            i_sad += res->i_lambda * (i_pred_mode == i_fmode ? 1 : 4);
+
+            /* keep the lowest cost (i_best == -1: nothing kept yet) */
+            if( i_best == -1 || i_best > i_sad )
+            {
+                res->i_predict4x4[x][y] = i_mode;
+                i_best = i_sad;
+            }
+        }
+        res->i_sad_i4x4 += i_best;
+
+        /* we need to encode this block now (it is needed for the next ones) */
+        mb->block[idx].i_intra4x4_pred_mode = res->i_predict4x4[x][y];
+        h->predict_4x4[res->i_predict4x4[x][y]]( p_dst_by, i_dst );
+        x264_mb_encode_4x4( h, mb, idx, res->i_qp );
+    }
+    res->i_sad_i4x4 += res->i_lambda * 24;    /* from JVT (SATD0) */
+}
+
+static void x264_macroblock_analyse_intra_chroma( x264_t *h, x264_macroblock_t *mb, x264_mb_analysis_t *res )
+{
+    /* Pick one intra predictor shared by both chroma planes (planes 1 and 2):
+     * every available mode is predicted into fdec and costed against img,
+     * the cheapest ends up in res->i_predict8x8 / res->i_sad_i8x8. */
+    uint8_t *p_rec[2], *p_orig[2];
+    int      i_rec[2],  i_orig[2];
+    int      modes[9];
+    int      n_modes;
+    int      k, c;
+
+    for( c = 0; c < 2; c++ )
+    {
+        p_rec[c]  = mb->context->p_fdec[1 + c]; i_rec[c]  = mb->context->i_fdec[1 + c];
+        p_orig[c] = mb->context->p_img[1 + c];  i_orig[c] = mb->context->i_img[1 + c];
+    }
+
+    predict_8x8_mode_available( mb, modes, &n_modes );
+    res->i_sad_i8x8 = -1;
+    for( k = 0; k < n_modes; k++ )
+    {
+        const int i_mode = modes[k];
+        int i_cost;
+
+        /* mode cost: lambda * bs_size_ue( fixed mode number ) */
+        i_cost = res->i_lambda * bs_size_ue( x264_mb_pred_mode8x8_fix(i_mode) );
+
+        /* predict each chroma plane in place, then add its satd against the source */
+        for( c = 0; c < 2; c++ )
+        {
+            h->predict_8x8[i_mode]( p_rec[c], i_rec[c] );
+            i_cost += h->pixf.satd[PIXEL_8x8]( p_rec[c], i_rec[c], p_orig[c], i_orig[c] );
+        }
+
+        /* keep the first candidate or any strictly cheaper one */
+        if( res->i_sad_i8x8 == -1 || i_cost < res->i_sad_i8x8 )
+        {
+            res->i_predict8x8 = i_mode;
+            res->i_sad_i8x8   = i_cost;
+        }
+    }
+}
+
+static void x264_macroblock_analyse_inter_p8x8( x264_t *h, x264_macroblock_t *mb, x264_mb_analysis_t *res )
+{
+    x264_mb_context_t *ctx = mb->context;
+    int i_ref = res->i_ref_p16x16;
+    /* Cost the 8x8 partitioning with the ref chosen for 16x16; its call in x264_macroblock_analyse_inter is commented out */
+    uint8_t *p_fref = ctx->p_fref0[i_ref][0];
+    int      i_fref = ctx->i_fref0[i_ref][0];
+    uint8_t *p_img  = ctx->p_img[0];
+    int      i_img  = ctx->i_img[0];
+
+    int i;
+
+    res->i_ref_p8x8 = i_ref;
+    res->i_sad_p8x8 = 0;
+    mb->i_partition = D_8x8;
+    /* one pass per 8x8 block of the macroblock */
+    for( i = 0; i < 4; i++ )
+    {
+        static const int test8x8_mode[4] = { D_L0_8x8, D_L0_8x4, D_L0_4x8, D_L0_4x4 };
+        static const int test8x8_pix[4]  = { PIXEL_8x8, PIXEL_8x4, PIXEL_4x8, PIXEL_4x4 };
+        static const int test8x8_pos_x[4][4] = { { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 4, 0, 0 }, { 0, 4, 0, 4 } };
+        static const int test8x8_pos_y[4][4] = { { 0, 0, 0, 0 }, { 0, 4, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 4, 4 } };
+        int i_test;
+        int mvp[4][2];
+        int mv[4][2];
+
+        int x, y;
+        int i_sub;
+        int i_b_satd;
+        /* pixel offset of this 8x8 block inside the macroblock */
+        y = 8 * (i / 2);
+        x = 8 * (i % 2);
+        i_b_satd = -1;
+
+        i_test = 0;
+        /* FIXME: testing every sub-partition type is far too slow, so only 8x8 (i_test = 0) is tried */
+        //for( i_test = 0; i_test < 4; i_test++ )
+        {
+            int i_satd;
+
+            i_satd = 0;
+
+            mb->i_sub_partition[i] = test8x8_mode[i_test];
+
+            for( i_sub = 0; i_sub < mb_sub_partition_count( test8x8_mode[i_test] ); i_sub++ )
+            {
+                x264_macroblock_predict_mv( mb, 0, i, i_sub, &mvp[i_sub][0], &mvp[i_sub][1] );
+                mv[i_sub][0] = mvp[i_sub][0];
+                mv[i_sub][1] = mvp[i_sub][1];
+                /* mv is set to the predictor before the search (presumably its start point - confirm) */
+                i_satd += x264_me_p_umhexagons( h,
+                                                &p_fref[(y+test8x8_pos_y[i_test][i_sub])*i_fref +x+test8x8_pos_x[i_test][i_sub]], i_fref,
+                                                &p_img[(y+test8x8_pos_y[i_test][i_sub])*i_img +x+test8x8_pos_x[i_test][i_sub]], i_img,
+                                                test8x8_pix[i_test],
+                                                res->i_lambda,
+                                                &mv[i_sub][0], &mv[i_sub][1] );
+                i_satd += res->i_lambda * ( bs_size_se( mv[i_sub][0] - mvp[i_sub][0] ) +
+                                            bs_size_se( mv[i_sub][1] - mvp[i_sub][1] ) );
+            }
+            /* add lambda * bs_size_ue( index of the sub-partition type ) */
+            switch( test8x8_mode[i_test] )
+            {
+                case D_L0_8x8:
+                    i_satd += res->i_lambda * bs_size_ue( 0 );
+                    break;
+                case D_L0_8x4:
+                    i_satd += res->i_lambda * bs_size_ue( 1 );
+                    break;
+                case D_L0_4x8:
+                    i_satd += res->i_lambda * bs_size_ue( 2 );
+                    break;
+                case D_L0_4x4:
+                    i_satd += res->i_lambda * bs_size_ue( 3 );
+                    break;
+                default:
+                    fprintf( stderr, "internal error (invalid sub type)\n" );
+                    break;
+            }
+            /* remember the cheapest sub-partition type and its mvs */
+            if( i_b_satd == -1 || i_b_satd > i_satd )
+            {
+                i_b_satd = i_satd;
+                res->i_sub_partition_p8x8[i] = test8x8_mode[i_test];;
+                for( i_sub = 0; i_sub < mb_sub_partition_count( test8x8_mode[i_test] ); i_sub++ )
+                {
+                    res->i_mv_p8x8[i][i_sub][0] = mv[i_sub][0];
+                    res->i_mv_p8x8[i][i_sub][1] = mv[i_sub][1];
+                }
+            }
+        }
+
+        res->i_sad_p8x8 += i_b_satd;
+        /* needed for the next block */
+        mb->i_sub_partition[i] = res->i_sub_partition_p8x8[i];
+        for( i_sub = 0; i_sub < mb_sub_partition_count( res->i_sub_partition_p8x8[i] ); i_sub++ )
+        {
+            x264_macroblock_partition_set( mb, 0, i, i_sub,
+                                           res->i_ref_p8x8,
+                                           res->i_mv_p8x8[i][i_sub][0],
+                                           res->i_mv_p8x8[i][i_sub][1] );
+        }
+    }
+    /* the reference index cost is counted four times, once per 8x8 block */
+    res->i_sad_p8x8 += 4*res->i_lambda * bs_size_te( h->sh.i_num_ref_idx_l0_active - 1, i_ref );
+}
+
+static void x264_macroblock_analyse_inter( x264_t *h, x264_macroblock_t *mb, x264_mb_analysis_t *res )
+{
+    x264_mb_context_t *ctx = mb->context;
+    /* Inter analysis: fills the 16x16, 16x8 and 8x16 costs, mvs and refs of res (the 8x8 call is commented out) */
+    int i_ref;
+
+    /* init res: -1 marks a mode that was not evaluated */
+    res->i_sad_p16x16 = -1;
+    res->i_sad_p16x8  = -1;
+    res->i_sad_p8x16  = -1;
+    res->i_sad_p8x8   = -1;
+
+    /* 16x16 search on all reference frames */
+    mb->i_type = P_L0;  /* ugly, fix that */
+    mb->i_partition = D_16x16;
+    for( i_ref = 0; i_ref < h->i_ref0; i_ref++ )
+    {
+        int i_sad;
+        int mvxp, mvyp;
+        int mvx, mvy;
+
+        /* Get the predicted MV */
+        x264_macroblock_partition_set( mb, 0, 0, 0, i_ref, 0, 0 );
+        x264_macroblock_predict_mv( mb, 0, 0, 0, &mvxp, &mvyp );
+
+        mvx = mvxp; mvy = mvyp;
+        i_sad = x264_me_p_umhexagons( h, ctx->p_fref0[i_ref][0], ctx->i_fref0[i_ref][0],
+                                         ctx->p_img[0],         ctx->i_img[0],
+                                         PIXEL_16x16, res->i_lambda, &mvx, &mvy );
+        if( mvx == mvxp && mvy == mvyp )
+        {
+            i_sad -= 16 * res->i_lambda;    /* bonus when the search ends on the predicted mv */
+        }
+        else
+        {
+            i_sad += res->i_lambda * (bs_size_se(mvx - mvxp) + bs_size_se(mvy - mvyp));
+        }
+        i_sad += res->i_lambda * bs_size_te( h->sh.i_num_ref_idx_l0_active - 1, i_ref );
+        /* keep the cheapest reference frame */
+        if( res->i_sad_p16x16 == -1 || i_sad < res->i_sad_p16x16 )
+        {
+            res->i_sad_p16x16   = i_sad;
+            res->i_ref_p16x16   = i_ref;
+            res->i_mv_p16x16[0] = mvx;
+            res->i_mv_p16x16[1] = mvy;
+        }
+    }
+
+    /* Now do the refinement (using the ref found in 16x16 mode) */
+    i_ref = res->i_ref_p16x16;
+    x264_macroblock_partition_set( mb, 0, 0, 0, i_ref, 0, 0 );
+
+    /* try 16x8 */
+    /* XXX we test i_predict16x16 to try shapes with the same direction as the edge.
+     * We should do a better algo of course (the one with edge detection, to be used
+     * for intra mode too)
+     * */
+    /* relies on res->i_predict16x16 having been set by the intra 16x16 analysis */
+    if( res->i_predict16x16 != I_PRED_16x16_V )
+    {
+        int mvp[2][2];
+
+        mb->i_partition = D_16x8;
+
+        res->i_ref_p16x8   = i_ref;
+        x264_macroblock_predict_mv( mb, 0, 0, 0, &mvp[0][0], &mvp[0][1] );
+        x264_macroblock_predict_mv( mb, 0, 1, 0, &mvp[1][0], &mvp[1][1] );
+
+        res->i_mv_p16x8[0][0] = mvp[0][0]; res->i_mv_p16x8[0][1] = mvp[0][1];
+        res->i_mv_p16x8[1][0] = mvp[1][0]; res->i_mv_p16x8[1][1] = mvp[1][1];
+        /* top half, then bottom half (8 rows further down in both planes) */
+        res->i_sad_p16x8 = x264_me_p_umhexagons( h,
+                                                 ctx->p_fref0[i_ref][0], ctx->i_fref0[i_ref][0],
+                                                 ctx->p_img[0],          ctx->i_img[0],
+                                                 PIXEL_16x8,
+                                                 res->i_lambda,
+                                                 &res->i_mv_p16x8[0][0], &res->i_mv_p16x8[0][1] ) +
+                           x264_me_p_umhexagons( h,
+                                                 &ctx->p_fref0[i_ref][0][8*ctx->i_fref0[i_ref][0]], ctx->i_fref0[i_ref][0],
+                                                 &ctx->p_img[0][8*ctx->i_img[0]],                   ctx->i_img[0],
+                                                 PIXEL_16x8,
+                                                 res->i_lambda,
+                                                 &res->i_mv_p16x8[1][0], &res->i_mv_p16x8[1][1] );
+
+        res->i_sad_p16x8 += res->i_lambda * ( bs_size_se(res->i_mv_p16x8[0][0] - mvp[0][0] ) +
+                                              bs_size_se(res->i_mv_p16x8[0][1] - mvp[0][1] ) +
+                                              bs_size_se(res->i_mv_p16x8[1][0] - mvp[1][0] ) +
+                                              bs_size_se(res->i_mv_p16x8[1][1] - mvp[1][1] ) );
+
+        res->i_sad_p16x8 += 2*res->i_lambda * bs_size_te( h->sh.i_num_ref_idx_l0_active - 1, i_ref );
+    }
+
+    /* try 8x16 */
+    if( res->i_predict16x16 != I_PRED_16x16_H )
+    {
+        int mvp[2][2];
+
+        mb->i_partition = D_8x16;
+
+        res->i_ref_p8x16   = i_ref;
+        x264_macroblock_predict_mv( mb, 0, 0, 0, &mvp[0][0], &mvp[0][1] );
+        x264_macroblock_predict_mv( mb, 0, 1, 0, &mvp[1][0], &mvp[1][1] );
+
+        res->i_mv_p8x16[0][0] = mvp[0][0]; res->i_mv_p8x16[0][1] = mvp[0][1];
+        res->i_mv_p8x16[1][0] = mvp[1][0]; res->i_mv_p8x16[1][1] = mvp[1][1];
+        /* left half, then right half (8 pixels further right in both planes) */
+        res->i_sad_p8x16 = x264_me_p_umhexagons( h,
+                                                 ctx->p_fref0[i_ref][0], ctx->i_fref0[i_ref][0],
+                                                 ctx->p_img[0],          ctx->i_img[0],
+                                                 PIXEL_8x16,
+                                                 res->i_lambda,
+                                                 &res->i_mv_p8x16[0][0], &res->i_mv_p8x16[0][1] ) +
+                           x264_me_p_umhexagons( h,
+                                                 &ctx->p_fref0[i_ref][0][8], ctx->i_fref0[i_ref][0],
+                                                 &ctx->p_img[0][8],          ctx->i_img[0],
+                                                 PIXEL_8x16,
+                                                 res->i_lambda,
+                                                 &res->i_mv_p8x16[1][0], &res->i_mv_p8x16[1][1] );
+
+        res->i_sad_p8x16 += res->i_lambda * ( bs_size_se(res->i_mv_p8x16[0][0] - mvp[0][0] ) +
+                                                bs_size_se(res->i_mv_p8x16[0][1] - mvp[0][1] ) +
+                                                bs_size_se(res->i_mv_p8x16[1][0] - mvp[1][0] ) +
+                                                bs_size_se(res->i_mv_p8x16[1][1] - mvp[1][1] ) );
+        res->i_sad_p8x16 += 2*res->i_lambda * bs_size_te( h->sh.i_num_ref_idx_l0_active - 1, i_ref );
+    }
+
+    if( 1 )
+    {
+    //    x264_macroblock_analyse_inter_p8x8( h,mb, res );
+    }
+}
+
+/*****************************************************************************
+ * x264_macroblock_analyse: choose the type, partition and modes/mvs of mb
+ *****************************************************************************/
+void x264_macroblock_analyse( x264_t *h, x264_macroblock_t *mb, int i_slice_type )
+{
+    x264_mb_analysis_t analysis;
+    int i;
+
+    /* qp TODO */
+    mb->i_qp_delta = 0;
+
+    /* init analysis: clipped qp and the lambda looked up from it */
+    analysis.i_qp = x264_clip3( h->pps.i_pic_init_qp + h->sh.i_qp_delta + mb->i_qp_delta, 0, 51 );
+    analysis.i_lambda = i_qp0_cost_table[analysis.i_qp];
+
+    x264_macroblock_analyse_edge( h, mb, &analysis );
+
+    /*--------------------------- Do the analysis ---------------------------*/
+    x264_macroblock_analyse_i16x16( h, mb, &analysis );
+    x264_macroblock_analyse_i4x4  ( h, mb, &analysis );
+    if( i_slice_type == SLICE_TYPE_P )
+    {
+        x264_macroblock_analyse_inter( h, mb, &analysis );
+    }
+
+    /*-------------------- Choose the macroblock mode -----------------------*/
+    /* Only P slices ran the inter analysis: any other slice type must not read the inter costs */
+    if( i_slice_type != SLICE_TYPE_P )
+    {
+        mb->i_type = analysis.i_sad_i4x4 < analysis.i_sad_i16x16 ? I_4x4 : I_16x16;
+    }
+    else
+    {
+        int i_satd;
+#define BEST_TYPE( type, partition, satd ) \
+        if( satd != -1 && satd < i_satd ) \
+        {   \
+            i_satd = satd;  \
+            mb->i_type = type; \
+            mb->i_partition = partition; \
+        }
+        /* start from intra 4x4, then take every evaluated mode (cost != -1) that is strictly cheaper */
+        i_satd = analysis.i_sad_i4x4;
+        mb->i_type = I_4x4;
+
+        BEST_TYPE( I_16x16, -1,    analysis.i_sad_i16x16 );
+        BEST_TYPE( P_L0,  D_16x16, analysis.i_sad_p16x16 );
+        BEST_TYPE( P_L0,  D_16x8 , analysis.i_sad_p16x8  );
+        BEST_TYPE( P_L0,  D_8x16 , analysis.i_sad_p8x16  );
+        BEST_TYPE( P_8x8, D_8x8  , analysis.i_sad_p8x8   );
+
+#undef BEST_TYPE
+    }
+    /* the chroma predictor is only searched once an intra type has been chosen */
+    if( IS_INTRA( mb->i_type ) )
+    {
+        x264_macroblock_analyse_intra_chroma( h, mb, &analysis );
+    }
+
+    /*-------------------- Update MB from the analysis ----------------------*/
+    switch( mb->i_type )
+    {
+        case I_4x4:
+            for( i = 0; i < 16; i++ )
+            {
+                mb->block[i].i_intra4x4_pred_mode = analysis.i_predict4x4[block_idx_x[i]][block_idx_y[i]];
+            }
+            mb->i_chroma_pred_mode = analysis.i_predict8x8;
+            break;
+        case I_16x16:
+            mb->i_intra16x16_pred_mode = analysis.i_predict16x16;
+            mb->i_chroma_pred_mode = analysis.i_predict8x8;
+            break;
+        case P_L0:
+            switch( mb->i_partition )
+            {
+                case D_16x16:
+                    x264_macroblock_partition_set( mb, 0, 0, 0,
+                                                   analysis.i_ref_p16x16, analysis.i_mv_p16x16[0], analysis.i_mv_p16x16[1] );
+                    break;
+                case D_16x8:
+                    x264_macroblock_partition_set( mb, 0, 0, 0,
+                                                   analysis.i_ref_p16x8, analysis.i_mv_p16x8[0][0], analysis.i_mv_p16x8[0][1] );
+                    x264_macroblock_partition_set( mb, 0, 1, 0,
+                                                   analysis.i_ref_p16x8, analysis.i_mv_p16x8[1][0], analysis.i_mv_p16x8[1][1] );
+                    break;
+                case D_8x16:
+                    x264_macroblock_partition_set( mb, 0, 0, 0,
+                                                   analysis.i_ref_p8x16, analysis.i_mv_p8x16[0][0], analysis.i_mv_p8x16[0][1] );
+                    x264_macroblock_partition_set( mb, 0, 1, 0,
+                                                   analysis.i_ref_p8x16, analysis.i_mv_p8x16[1][0], analysis.i_mv_p8x16[1][1] );
+                    break;
+                default:
+                    fprintf( stderr, "internal error\n" );
+                    break;
+            }
+            break;
+
+        case P_8x8:
+            for( i = 0; i < 4; i++ )
+            {
+                int i_sub;
+
+                mb->i_sub_partition[i] = analysis.i_sub_partition_p8x8[i];
+                for( i_sub = 0; i_sub < mb_sub_partition_count( mb->i_sub_partition[i] ); i_sub++ )
+                {
+                    x264_macroblock_partition_set( mb, 0, i, i_sub,
+                                                   analysis.i_ref_p8x8,
+                                                   analysis.i_mv_p8x8[i][i_sub][0],
+                                                   analysis.i_mv_p8x8[i][i_sub][1] );
+                }
+            }
+            break;
+
+        default:
+            fprintf( stderr, "internal error\n" );
+            break;
+    }
+}
+
+
+
+/*****************************************************************************
+ * x264_macroblock_encode: code the residual of mb, update fdec, set the cbp
+ *****************************************************************************/
+void x264_macroblock_encode( x264_t *h, x264_macroblock_t *mb )
+{
+    int i;
+
+    int     i_qscale;
+
+    /* quantization scale */
+    i_qscale = x264_clip3( h->pps.i_pic_init_qp + h->sh.i_qp_delta + mb->i_qp_delta, 0, 51 );
+
+    if( mb->i_type == I_16x16 )
+    {
+        /* do the right prediction */
+        h->predict_16x16[mb->i_intra16x16_pred_mode]( mb->context->p_fdec[0], mb->context->i_fdec[0] );
+
+        /* encode the 16x16 macroblock */
+        x264_mb_encode_i16x16( h, mb, i_qscale );
+
+        /* fix the pred mode value */
+        mb->i_intra16x16_pred_mode = x264_mb_pred_mode16x16_fix( mb->i_intra16x16_pred_mode );
+    }
+    else if( mb->i_type == I_4x4 )
+    {
+        for( i = 0; i < 16; i++ )
+        {
+            uint8_t *p_dst_by;
+
+            /* Do the right prediction */
+            p_dst_by = mb->context->p_fdec[0] + 4 * block_idx_x[i] + 4 * block_idx_y[i] * mb->context->i_fdec[0];
+            h->predict_4x4[mb->block[i].i_intra4x4_pred_mode]( p_dst_by, mb->context->i_fdec[0] );
+
+            /* encode one 4x4 block */
+            x264_mb_encode_4x4( h, mb, i, i_qscale );
+
+            /* fix the pred mode value */
+            mb->block[i].i_intra4x4_pred_mode = x264_mb_pred_mode4x4_fix( mb->block[i].i_intra4x4_pred_mode );
+        }
+    }
+    else    /* Inter MB */
+    {
+        x264_mb_context_t *ctx = mb->context;
+        int16_t dct4x4[16][4][4];
+
+        int i8x8, i4x4, idx;
+        int i_decimate_mb = 0;
+
+        /* Motion compensation */
+        x264_macroblock_mc( h, mb, 1 );
+
+        for( i8x8 = 0; i8x8 < 4; i8x8++ )
+        {
+            int16_t luma[4][4];
+            int i_decimate_8x8;
+
+            /* encode the four 4x4 blocks of this 8x8 block */
+            i_decimate_8x8 = 0;
+            for( i4x4 = 0; i4x4 < 4; i4x4++ )
+            {
+                uint8_t *p_src, *p_dst;
+
+                idx = i8x8 * 4 + i4x4;
+
+                p_src = ctx->p_img[0] + 4 * block_idx_x[idx] + 4 * block_idx_y[idx] * ctx->i_img[0];
+                p_dst = ctx->p_fdec[0] + 4 * block_idx_x[idx] + 4 * block_idx_y[idx] * ctx->i_fdec[0];
+
+                /* we calculate diff */
+                h->pixf.sub4x4( luma, p_src, ctx->i_img[0],p_dst, ctx->i_fdec[0] );
+
+                /* calculate dct coeffs */
+                h->dctf.dct4x4( dct4x4[idx], luma );
+                quant_4x4( dct4x4[idx], i_qscale, 1 );
+
+                scan_zigzag_4x4full( mb->block[idx].luma4x4, dct4x4[idx] );
+                i_decimate_8x8 += x264_mb_decimate_score( mb->block[idx].luma4x4, 16 );
+            }
+
+            /* decimate this 8x8 block: drop all its coefficients when its score is below 4 */
+            i_decimate_mb += i_decimate_8x8;
+            if( i_decimate_8x8 < 4 )
+            {
+                for( i4x4 = 0; i4x4 < 4; i4x4++ )
+                {
+                    int x, y;
+                    idx = i8x8 * 4 + i4x4;
+                    for( i = 0; i < 16; i++ )
+                    {
+                        mb->block[idx].luma4x4[i] = 0;
+                    }
+                    for( x = 0; x < 4; x++ )
+                    {
+                        for( y = 0; y < 4; y++ )
+                        {
+                            dct4x4[idx][x][y] = 0;
+                        }
+                    }
+                }
+            }
+        }
+        /* whole MB score below 6: drop every luma coefficient, fdec keeps the plain prediction */
+        if( i_decimate_mb < 6 )
+        {
+            for( i8x8 = 0; i8x8 < 4; i8x8++ )
+            {
+                for( i4x4 = 0; i4x4 < 4; i4x4++ )
+                {
+                    for( i = 0; i < 16; i++ )
+                    {
+                        mb->block[i8x8 * 4 + i4x4].luma4x4[i] = 0;
+                    }
+                }
+            }
+        }
+        else
+        {
+            for( i8x8 = 0; i8x8 < 4; i8x8++ )
+            {
+                int16_t luma[4][4];
+                /* TODO we could avoid it if we had decimated this 8x8 block */
+                /* output samples to fdec */
+                for( i4x4 = 0; i4x4 < 4; i4x4++ )
+                {
+                    uint8_t *p_dst;
+
+                    idx = i8x8 * 4 + i4x4;
+
+                    dequant_4x4( dct4x4[idx], i_qscale );
+                    h->dctf.idct4x4( luma, dct4x4[idx] );
+
+                    /* put pixel to fdec */
+                    p_dst = ctx->p_fdec[0] + 4 * block_idx_x[idx] + 4 * block_idx_y[idx] * ctx->i_fdec[0];
+                    h->pixf.add4x4( p_dst, ctx->i_fdec[0], luma );
+                }
+            }
+        }
+    }
+
+    /* encode chroma: the chroma qp is looked up from the luma qp plus the pps offset */
+    i_qscale = i_chroma_qp_table[x264_clip3( i_qscale + h->pps.i_chroma_qp_index_offset, 0, 51 )];
+    if( IS_INTRA( mb->i_type ) )
+    {
+        /* do the right prediction */
+        h->predict_8x8[mb->i_chroma_pred_mode]( mb->context->p_fdec[1], mb->context->i_fdec[1] );
+        h->predict_8x8[mb->i_chroma_pred_mode]( mb->context->p_fdec[2], mb->context->i_fdec[2] );
+    }
+    else
+    {
+        /* Motion compensation */
+        x264_macroblock_mc( h, mb, 0 );
+    }
+    /* encode the 8x8 blocks */
+    x264_mb_encode_8x8( h, mb, !IS_INTRA( mb->i_type ), i_qscale );
+
+    /* fix the pred mode value */
+    if( IS_INTRA( mb->i_type ) )
+    {
+        mb->i_chroma_pred_mode = x264_mb_pred_mode8x8_fix( mb->i_chroma_pred_mode );
+    }
+
+    /* Calculate the Luma/Chroma pattern and non_zero_count */
+    if( mb->i_type == I_16x16 )
+    {
+        mb->i_cbp_luma = 0x00;
+        for( i = 0; i < 16; i++ )
+        {
+            mb->block[i].i_non_zero_count = array_non_zero_count( mb->block[i].residual_ac, 15 );
+            if( mb->block[i].i_non_zero_count > 0 )
+            {
+                mb->i_cbp_luma = 0x0f;  /* all or nothing for I_16x16 */
+            }
+        }
+    }
+    else
+    {
+        mb->i_cbp_luma = 0x00;
+        for( i = 0; i < 16; i++ )
+        {
+            mb->block[i].i_non_zero_count = array_non_zero_count( mb->block[i].luma4x4, 16 );
+            if( mb->block[i].i_non_zero_count > 0 )
+            {
+                mb->i_cbp_luma |= 1 << (i/4);   /* one bit per group of four 4x4 blocks */
+            }
+        }
+    }
+
+    /* Calculate the chroma pattern */
+    mb->i_cbp_chroma = 0x00;
+    for( i = 0; i < 8; i++ )
+    {
+        mb->block[16+i].i_non_zero_count = array_non_zero_count( mb->block[16+i].residual_ac, 15 );
+        if( mb->block[16+i].i_non_zero_count > 0 )
+        {
+            mb->i_cbp_chroma = 0x02;    /* dc+ac (we can't do only ac) */
+        }
+    }
+    if( mb->i_cbp_chroma == 0x00 &&
+        ( array_non_zero_count( mb->chroma_dc[0], 4 ) > 0 || array_non_zero_count( mb->chroma_dc[1], 4 ) > 0 ) )
+    {
+        mb->i_cbp_chroma = 0x01;    /* dc only */
+    }
+
+    /* Check for P_SKIP
+     * XXX: in the me perhaps we should take x264_macroblock_predict_mv_pskip into account
+     *      (if multiple mv give same result)*/
+    if( mb->i_type == P_L0 && mb->i_partition == D_16x16 &&
+        mb->i_cbp_luma == 0x00 && mb->i_cbp_chroma == 0x00 )
+    {
+
+        int i_ref;
+        int mvx, mvy;
+        x264_macroblock_partition_get( mb, 0, 0, 0, &i_ref, &mvx, &mvy );
+        /* P_SKIP needs ref 0 and an mv equal to the one predicted for a skipped MB */
+        if( i_ref == 0 )
+        {
+            int mvxp, mvyp;
+
+            x264_macroblock_predict_mv_pskip( mb, &mvxp, &mvyp );
+            if( mvxp == mvx && mvyp == mvy )
+            {
+                mb->i_type = P_SKIP;
+            }
+        }
+    }
+}
+
+
+#define BLOCK_INDEX_CHROMA_DC   (-1)    /* i_idx value selecting the chroma DC tables in block_residual_write */
+#define BLOCK_INDEX_LUMA_DC     (-2)    /* i_idx value for luma DC: coded with the prediction of block 0 */
+
+/****************************************************************************
+ * block_residual_write: write the coefficients l[0..i_count-1] of one block to s
+ ****************************************************************************/
+static void block_residual_write( x264_t *h, bs_t *s, x264_macroblock_t *mb, int i_idx, int *l, int i_count )
+{
+    int level[16], run[16];
+    int i_total, i_trailing;
+    int i_total_zero;
+    int i_last;
+    unsigned int i_sign;
+    /* i_idx: a block index, or BLOCK_INDEX_CHROMA_DC / BLOCK_INDEX_LUMA_DC */
+    int i;
+    int i_zero_left;
+    int i_suffix_length;
+
+    /* first find i_last, the index of the last non-zero coefficient (-1 if there is none) */
+    i_last = i_count - 1;
+    while( i_last >= 0 && l[i_last] == 0 )
+    {
+        i_last--;
+    }
+
+    i_sign = 0;
+    i_total = 0;
+    i_trailing = 0;
+    i_total_zero = 0;
+
+    if( i_last >= 0 )
+    {
+        int b_trailing = 1;
+        int idx = 0;
+
+        /* scan backwards: level[] gets the non-zero values, run[] the zeros found just below each of them */
+        while( i_last >= 0 )
+        {
+            level[idx] = l[i_last--];
+
+            run[idx] = 0;
+            while( i_last >= 0 && l[i_last] == 0 )
+            {
+                run[idx]++;
+                i_last--;
+            }
+
+            i_total++;
+            i_total_zero += run[idx];
+            /* up to 3 leading +/-1 entries of level[] count as trailing ones; i_sign collects their signs */
+            if( b_trailing && abs( level[idx] ) == 1 && i_trailing < 3 )
+            {
+                i_sign <<= 1;
+                if( level[idx] < 0 )
+                {
+                    i_sign |= 0x01;
+                }
+
+                i_trailing++;
+            }
+            else
+            {
+                b_trailing = 0;
+            }
+
+            idx++;
+        }
+    }
+
+    /* total/trailing: the table comes from the block kind or from predict_non_zero_code() */
+    if( i_idx == BLOCK_INDEX_CHROMA_DC )
+    {
+        bs_write_vlc( s, x264_coeff_token[4][i_total][i_trailing] );
+    }
+    else
+    {
+        /* predict_non_zero_code return 0 <-> (16+16+1)>>1 = 16 */
+        static const int ct_index[17] = {0,0,1,1,2,2,2,2,3,3,3,3,3,3,3,3,3 };
+        int nC;
+
+        if( i_idx == BLOCK_INDEX_LUMA_DC )
+        {
+            nC = predict_non_zero_code( h, mb, 0 );
+        }
+        else
+        {
+            nC = predict_non_zero_code( h, mb, i_idx );
+        }
+
+        bs_write_vlc( s, x264_coeff_token[ct_index[nC]][i_total][i_trailing] );
+    }
+    /* nothing more to write for an all-zero block */
+    if( i_total <= 0 )
+    {
+        return;
+    }
+    /* signs of the trailing ones first, then the remaining levels */
+    i_suffix_length = i_total > 10 && i_trailing < 3 ? 1 : 0;
+    if( i_trailing > 0 )
+    {
+        bs_write( s, i_trailing, i_sign );
+    }
+    for( i = i_trailing; i < i_total; i++ )
+    {
+        int i_level_code;
+
+        /* calculate level code */
+        if( level[i] < 0 )
+        {
+            i_level_code = -2*level[i] - 1;
+        }
+        else /* if( level[i] > 0 ) */
+        {
+            i_level_code = 2 * level[i] - 2;
+        }
+        if( i == i_trailing && i_trailing < 3 )
+        {
+            i_level_code -=2; /* as level[i] can't be 1 for the first one if i_trailing < 3 */
+        }
+        /* pick the prefix/suffix form from the size of the level code */
+        if( ( i_level_code >> i_suffix_length ) < 14 )
+        {
+            bs_write_vlc( s, x264_level_prefix[i_level_code >> i_suffix_length] );
+            if( i_suffix_length > 0 )
+            {
+                bs_write( s, i_suffix_length, i_level_code );
+            }
+        }
+        else if( i_suffix_length == 0 && i_level_code < 30 )
+        {
+            bs_write_vlc( s, x264_level_prefix[14] );
+            bs_write( s, 4, i_level_code - 14 );
+        }
+        else if( i_suffix_length > 0 && ( i_level_code >> i_suffix_length ) == 14 )
+        {
+            bs_write_vlc( s, x264_level_prefix[14] );
+            bs_write( s, i_suffix_length, i_level_code );
+        }
+        else
+        {
+            bs_write_vlc( s, x264_level_prefix[15] );
+            i_level_code -= 15 << i_suffix_length;
+            if( i_suffix_length == 0 )
+            {
+                i_level_code -= 15;
+            }
+            /* the escape form only has 12 bits: out of range values are reported but still written */
+            if( i_level_code >= ( 1 << 12 ) || i_level_code < 0 )
+            {
+                fprintf( stderr, "OVERFLOW levelcode=%d\n", i_level_code );
+            }
+
+            bs_write( s, 12, i_level_code );    /* check overflow ?? */
+        }
+        /* the suffix length grows with the magnitude of the levels already written, up to 6 */
+        if( i_suffix_length == 0 )
+        {
+            i_suffix_length++;
+        }
+        if( abs( level[i] ) > ( 3 << ( i_suffix_length - 1 ) ) && i_suffix_length < 6 )
+        {
+            i_suffix_length++;
+        }
+    }
+    /* total number of zeros, written only when the block is not full */
+    if( i_total < i_count )
+    {
+        if( i_idx == BLOCK_INDEX_CHROMA_DC )
+        {
+            bs_write_vlc( s, x264_total_zeros_dc[i_total-1][i_total_zero] );
+        }
+        else
+        {
+            bs_write_vlc( s, x264_total_zeros[i_total-1][i_total_zero] );
+        }
+    }
+    /* run of zeros of each level, until no zero is left; none is written for the last level */
+    for( i = 0, i_zero_left = i_total_zero; i < i_total - 1; i++ )
+    {
+        int i_zl;
+
+        if( i_zero_left <= 0 )
+        {
+            break;
+        }
+
+        i_zl = X264_MIN( i_zero_left - 1, 6 );
+
+        bs_write_vlc( s, x264_run_before[i_zl][run[i]] );
+
+        i_zero_left -= run[i];
+    }
+}
+
+
+
+
+/*****************************************************************************
+ * x264_macroblock_write: write one macroblock (type, prediction, cbp, residual)
+ *****************************************************************************/
+void x264_macroblock_write( x264_t *h, bs_t *s, int i_slice_type, x264_macroblock_t *mb )
+{
+    int i;
+    int i_mb_i_offset;      /* added to the intra mb_type codes, depends on the slice type */
+    int b_sub_ref0 = 1;     /* P_8x8 only: cleared when P_8x8ref0 is written (no ref idx follows) */
+    /* int b_sub_ref1 = 1; */
+
+    switch( i_slice_type )
+    {
+        case SLICE_TYPE_I:
+            i_mb_i_offset = 0;
+            break;
+        case SLICE_TYPE_P:
+            i_mb_i_offset = 5;
+            break;
+        case SLICE_TYPE_B:
+            i_mb_i_offset = 23 + 5;
+            break;
+        default:
+            fprintf( stderr, "internal error or slice unsupported\n" );
+            return;
+    }
+
+    /* PCM special block type (UNTESTED): raw samples, row by row inside the macroblock */
+    if( mb->i_type == I_PCM )
+    {
+        bs_write_ue( s, i_mb_i_offset + 25 );   /* I_PCM */
+        bs_align_0( s );
+        /* Luma: sample i sits at row i / 16, column i % 16 of the 16x16 block */
+        for( i = 0; i < 16*16; i++ )
+        {
+            bs_write( s, 8, h->picture->plane[0][( mb->i_mb_y * 16 + i / 16 ) * h->picture->i_stride[0] + mb->i_mb_x * 16 + i % 16] );
+        }
+        /* Cb: sample i sits at row i / 8, column i % 8 of the 8x8 block */
+        for( i = 0; i < 8*8; i++ )
+        {
+            bs_write( s, 8, h->picture->plane[1][( mb->i_mb_y * 8 + i / 8 ) * h->picture->i_stride[1] + mb->i_mb_x * 8 + i % 8] );
+        }
+        /* Cr: same layout as Cb */
+        for( i = 0; i < 8*8; i++ )
+        {
+            bs_write( s, 8, h->picture->plane[2][( mb->i_mb_y * 8 + i / 8 ) * h->picture->i_stride[2] + mb->i_mb_x * 8 + i % 8] );
+        }
+
+        for( i = 0; i < 16 + 8; i++ )
+        {
+            /* special case */
+            mb->block[i].i_non_zero_count = 16;
+        }
+        return;
+    }
+
+    if( mb->i_type == I_4x4 )
+    {
+        bs_write_ue( s, i_mb_i_offset + 0 );    /* I_4x4 */
+    }
+    else if( mb->i_type == I_16x16 )
+    {
+        int i_type = 1 + mb->i_intra16x16_pred_mode + mb->i_cbp_chroma * 4 + ( mb->i_cbp_luma == 0 ? 0 : 12 );
+
+        bs_write_ue( s, i_mb_i_offset + i_type );
+    }
+    else if( mb->i_type == P_L0 )
+    {
+        if( mb->i_partition == D_16x16 )
+        {
+            bs_write_ue( s, 0 );
+        }
+        else if( mb->i_partition == D_16x8 )
+        {
+            bs_write_ue( s, 1 );
+        }
+        else if( mb->i_partition == D_8x16 )
+        {
+            bs_write_ue( s, 2 );
+        }
+    }
+    else if( mb->i_type == P_8x8 )
+    {
+        if( mb->partition[0][0].i_ref[0] == 0 &&
+            mb->partition[0][2].i_ref[0] == 0 &&
+            mb->partition[2][0].i_ref[0] == 0 &&
+            mb->partition[2][2].i_ref[0] == 0 )
+        {
+            b_sub_ref0 = 0;
+            bs_write_ue( s, 4 );    /* P_8x8ref0 */
+        }
+        else
+        {
+            b_sub_ref0 = 1;
+            bs_write_ue( s, 3 );
+        }
+    }
+    else
+    {
+        /* TODO B type */
+    }
+
+    if( IS_INTRA( mb->i_type ) )
+    {
+        /* Prediction */
+        if( mb->i_type == I_4x4 )
+        {
+            for( i = 0; i < 16; i++ )
+            {
+                int i_predicted_mode = predict_pred_intra4x4_mode( h, mb, i );
+                int i_mode = mb->block[i].i_intra4x4_pred_mode;
+
+                if( i_predicted_mode == i_mode)
+                {
+                    bs_write( s, 1, 1 );  /* b_prev_intra4x4_pred_mode */
+                }
+                else
+                {
+                    bs_write( s, 1, 0 );  /* b_prev_intra4x4_pred_mode */
+                    if( i_mode < i_predicted_mode )
+                    {
+                        bs_write( s, 3, i_mode );
+                    }
+                    else
+                    {
+                        bs_write( s, 3, i_mode - 1 );
+                    }
+                }
+            }
+        }
+        bs_write_ue( s, mb->i_chroma_pred_mode );
+    }
+    else if( mb->i_type == P_8x8 )
+    {
+        /* sub mb type */
+        for( i = 0; i < 4; i++ )
+        {
+            switch( mb->i_sub_partition[i] )
+            {
+                case D_L0_8x8:
+                    bs_write_ue( s, 0 );
+                    break;
+                case D_L0_8x4:
+                    bs_write_ue( s, 1 );
+                    break;
+                case D_L0_4x8:
+                    bs_write_ue( s, 2 );
+                    break;
+                case D_L0_4x4:
+                    bs_write_ue( s, 3 );
+                    break;
+            }
+        }
+        /* ref0 */
+        if( h->sh.i_num_ref_idx_l0_active > 1 && b_sub_ref0 )
+        {
+            for( i = 0; i < 4; i++ )
+            {
+                int i_ref;
+                x264_macroblock_partition_get( mb, 0, i, 0, &i_ref, NULL, NULL );
+
+                bs_write_te( s, h->sh.i_num_ref_idx_l0_active - 1, i_ref );
+            }
+        }
+        for( i = 0; i < 4; i++ )
+        {
+            int i_part;
+            for( i_part = 0; i_part < mb_sub_partition_count( mb->i_sub_partition[i] ); i_part++ )
+            {
+                int mvx, mvy;
+                int mvxp, mvyp;
+
+                x264_macroblock_partition_get( mb, 0, i, i_part, NULL, &mvx, &mvy );
+                x264_macroblock_predict_mv( mb, 0, i, i_part, &mvxp, &mvyp );
+
+                bs_write_se( s, mvx - mvxp );
+                bs_write_se( s, mvy - mvyp);
+            }
+        }
+    }
+    else if( mb->i_type == B_8x8 )
+    {
+        /* TODO for B-frame (merge it with P_8x8 ?)*/
+    }
+    else if( mb->i_type != B_DIRECT )
+    {
+        /* FIXME -> invalid for B frame */
+
+        /* Motion Vector */
+        int i_part = 1 + ( mb->i_partition != D_16x16 ? 1 : 0 );
+
+        if( h->sh.i_num_ref_idx_l0_active > 1 )
+        {
+            for( i = 0; i < i_part; i++ )
+            {
+                if( mb->i_type == P_L0 )    /* fixme B-frame */
+                {
+                    int i_ref;
+                    x264_macroblock_partition_get( mb, 0, i, 0, &i_ref, NULL, NULL );
+                    bs_write_te( s, h->sh.i_num_ref_idx_l0_active - 1, i_ref ); /* -1 is correct ? */
+                }
+            }
+        }
+        if( h->sh.i_num_ref_idx_l1_active > 1 )
+        {
+            for( i = 0; i < i_part; i++ )
+            {
+                /* ref idx part L1 TODO when needed */
+            }
+        }
+
+        for( i = 0; i < i_part; i++ )
+        {
+            if( mb->i_type == P_L0 )
+            {
+                int mvx, mvy;
+                int mvxp, mvyp;
+
+                x264_macroblock_partition_get( mb, 0, i, 0, NULL, &mvx, &mvy );
+                x264_macroblock_predict_mv( mb, 0, i, 0, &mvxp, &mvyp );
+
+                bs_write_se( s, mvx - mvxp );
+                bs_write_se( s, mvy - mvyp);
+            }
+        }
+        /* Same for L1 for B frame */
+    }
+
+    if( mb->i_type != I_16x16 )
+    {
+        if( mb->i_type == I_4x4 )
+        {
+            bs_write_ue( s, intra4x4_cbp_to_golomb[( mb->i_cbp_chroma << 4 )|mb->i_cbp_luma] );
+        }
+        else
+        {
+            bs_write_ue( s, inter_cbp_to_golomb[( mb->i_cbp_chroma << 4 )|mb->i_cbp_luma] );
+        }
+    }
+
+    if( mb->i_cbp_luma > 0 || mb->i_cbp_chroma > 0 || mb->i_type == I_16x16 )
+    {
+        bs_write_se( s, mb->i_qp_delta );
+
+        /* write residual */
+        if( mb->i_type == I_16x16 )
+        {
+            /* DC Luma */
+            block_residual_write( h, s, mb, BLOCK_INDEX_LUMA_DC , mb->luma16x16_dc, 16 );
+
+            if( mb->i_cbp_luma != 0 )
+            {
+                /* AC Luma */
+                for( i = 0; i < 16; i++ )
+                {
+                    block_residual_write( h, s, mb, i, mb->block[i].residual_ac, 15 );
+                }
+            }
+        }
+        else
+        {
+            for( i = 0; i < 16; i++ )
+            {
+                if( mb->i_cbp_luma & ( 1 << ( i / 4 ) ) )
+                {
+                    block_residual_write( h, s, mb, i, mb->block[i].luma4x4, 16 );
+                }
+            }
+        }
+
+        if( mb->i_cbp_chroma &0x03 )    /* Chroma DC residual present */
+        {
+            block_residual_write( h, s, mb, BLOCK_INDEX_CHROMA_DC, mb->chroma_dc[0], 4 );
+            block_residual_write( h, s, mb, BLOCK_INDEX_CHROMA_DC, mb->chroma_dc[1], 4 );
+        }
+        if( mb->i_cbp_chroma&0x02 ) /* Chroma AC residual present */
+        {
+            for( i = 0; i < 8; i++ )
+            {
+                block_residual_write( h, s, mb, 16 + i, mb->block[16+i].residual_ac, 15 );
+            }
+        }
+    }
+}
+
diff --git a/testing/macroblock-dz.c b/testing/macroblock-dz.c
new file mode 100644
index 00000000..719edcbb
--- /dev/null
+++ b/testing/macroblock-dz.c
@@ -0,0 +1,2266 @@
+/*****************************************************************************
+ * macroblock.c: h264 encoder library
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: macroblock-dz.c,v 1.1 2004/06/03 19:27:08 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdint.h>
+
+#include "../core/common.h"
+#include "../core/vlc.h"
+#include "macroblock.h"
+
+static const uint8_t intra4x4_cbp_to_golomb[48]=   /* coded block pattern -> golomb code value, intra 4x4 variant */
+{   /* NOTE(review): lookup site is outside this view; presumably index = (cbp_chroma << 4) | cbp_luma -- confirm */
+  3, 29, 30, 17, 31, 18, 37,  8, 32, 38, 19,  9, 20, 10, 11,  2,
+ 16, 33, 34, 21, 35, 22, 39,  4, 36, 40, 23,  5, 24,  6,  7,  1,
+ 41, 42, 43, 25, 44, 26, 46, 12, 45, 47, 27, 13, 28, 14, 15,  0
+};
+static const uint8_t inter_cbp_to_golomb[48]=      /* same mapping for the non intra-4x4 case (per its name) */
+{
+  0,  2,  3,  7,  4,  8, 17, 13,  5, 18,  9, 14, 10, 15, 16, 11,
+  1, 32, 33, 36, 34, 37, 44, 40, 35, 45, 38, 41, 39, 42, 43, 19,
+  6, 24, 25, 20, 26, 21, 46, 28, 27, 47, 22, 29, 23, 30, 31, 12
+};
+
+static const uint8_t block_idx_x[16] =     /* column, in units of 4 pixels, of luma block idx */
+{
+    0, 1, 0, 1, 2, 3, 2, 3, 0, 1, 0, 1, 2, 3, 2, 3
+};
+static const uint8_t block_idx_y[16] =     /* row, in units of 4 pixels, of luma block idx */
+{
+    0, 0, 1, 1, 0, 0, 1, 1, 2, 2, 3, 3, 2, 2, 3, 3
+};
+static const uint8_t block_idx_xy[4][4] =  /* inverse of the two tables above: [x][y] -> idx */
+{
+    { 0, 2, 8,  10},
+    { 1, 3, 9,  11},
+    { 4, 6, 12, 14},
+    { 5, 7, 13, 15}
+};
+
+static const int quant_mf[6][4][4] =   /* quantiser multiplier, indexed [i_qscale % 6][y][x] */
+{
+    {  { 13107, 8066, 13107, 8066}, {  8066, 5243,  8066, 5243},
+       { 13107, 8066, 13107, 8066}, {  8066, 5243,  8066, 5243}  },
+    {  { 11916, 7490, 11916, 7490}, {  7490, 4660,  7490, 4660},
+       { 11916, 7490, 11916, 7490}, {  7490, 4660,  7490, 4660}  },
+    {  { 10082, 6554, 10082, 6554}, {  6554, 4194,  6554, 4194},
+       { 10082, 6554, 10082, 6554}, {  6554, 4194,  6554, 4194}  },
+    {  {  9362, 5825,  9362, 5825}, {  5825, 3647,  5825, 3647},
+       {  9362, 5825,  9362, 5825}, {  5825, 3647,  5825, 3647}  },
+    {  {  8192, 5243,  8192, 5243}, {  5243, 3355,  5243, 3355},
+       {  8192, 5243,  8192, 5243}, {  5243, 3355,  5243, 3355}  },
+    {  {  7282, 4559,  7282, 4559}, {  4559, 2893,  4559, 2893},
+       {  7282, 4559,  7282, 4559}, {  4559, 2893,  4559, 2893}  }
+};
+
+static const int i_chroma_qp_table[52] =   /* chroma qscale for a luma qp (plus pps offset) clipped to 0..51 */
+{
+     0,  1,  2,  3,  4,  5,  6,  7,  8,  9,
+    10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+    20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+    29, 30, 31, 32, 32, 33, 34, 34, 35, 35,
+    36, 36, 37, 37, 37, 38, 38, 38, 39, 39,
+    39, 39
+};
+
+/* Rounding offset used by the quantisers, as a fraction num/den, indexed [y][x] for intra blocks */
+static const int f_deadzone_intra[4][4][2] = /* [num][den] */
+{
+    { {1,2}, {3,7}, {2,5}, {1,3} },
+    { {3,7}, {2,5}, {1,3}, {1,4} },
+    { {2,5}, {1,3}, {1,4}, {1,5} },
+    { {1,3}, {1,4}, {1,5}, {1,5} }
+};
+static const int f_deadzone_inter[4][4][2] = /* [num][den], same layout, used when b_intra is 0 */
+{
+    { {1,3}, {2,7}, {4,15},{2,9} },
+    { {2,7}, {4,15},{2,9}, {1,6} },
+    { {4,15},{2,9}, {1,6}, {1,7} },
+    { {2,9}, {1,6}, {1,7}, {2,15} }
+};
+
+/****************************************************************************
+ * Scan and Quant functions
+ ****************************************************************************/
+static const int scan_zigzag_x[16]={0, 1, 0, 0, 1, 2, 3, 2, 1, 0, 1, 2, 3, 3, 2, 3};
+static const int scan_zigzag_y[16]={0, 0, 1, 2, 1, 0, 0, 1, 2, 3, 3, 2, 1, 2, 3, 3};
+
+static inline void scan_zigzag_4x4full( int level[16], int16_t dct[4][4] )
+{
+    /* level[n] receives the coefficient at zigzag position n, dc included */
+    int i_scan = 16;
+    while( i_scan-- > 0 )
+    {
+        level[i_scan] = dct[scan_zigzag_y[i_scan]][scan_zigzag_x[i_scan]];
+    }
+}
+static inline void scan_zigzag_4x4( int level[15], int16_t dct[4][4] )
+{
+    /* ac only: zigzag positions 1..15 land in level[0..14], position 0 (dc) is skipped */
+    int i_ac;
+    for( i_ac = 0; i_ac < 15; i_ac++ )
+    {
+        level[i_ac] = dct[scan_zigzag_y[i_ac + 1]][scan_zigzag_x[i_ac + 1]];
+    }
+}
+
+static inline void scan_zigzag_2x2_dc( int level[4], int16_t dct[2][2] )
+{
+    const int16_t *p_top = dct[0], *p_bottom = dct[1];  /* row-major copy of the 2x2 block */
+
+    level[0] = p_top[0];     level[1] = p_top[1];
+    level[2] = p_bottom[0];  level[3] = p_bottom[1];
+}
+
+#if 0
+static void quant_4x4( int16_t dct[4][4], int i_qscale, int b_intra )
+{
+    const int i_qbits = 15 + i_qscale / 6;
+    const int i_mf = i_qscale % 6;
+    const int f = ( 1 << i_qbits ) / ( b_intra ? 3 : 6 );
+
+    int x,y;
+    for( y = 0; y < 4; y++ )
+    {
+        for( x = 0; x < 4; x++ )
+        {
+            if( dct[y][x] > 0 )
+            {
+                dct[y][x] =( f + dct[y][x]  * quant_mf[i_mf][y][x] ) >> i_qbits;
+            }
+            else
+            {
+                dct[y][x] = - ( ( f - dct[y][x]  * quant_mf[i_mf][y][x] ) >> i_qbits );
+            }
+        }
+    }
+}
+static void quant_4x4_dc( int16_t dct[4][4], int i_qscale )
+{
+    const int i_qbits = 15 + i_qscale / 6;
+    const int f2 = ( 2 << i_qbits ) / 3;
+    const int i_qmf = quant_mf[i_qscale%6][0][0];
+    int x,y;
+
+    for( y = 0; y < 4; y++ )
+    {
+        for( x = 0; x < 4; x++ )
+        {
+            if( dct[y][x] > 0 )
+            {
+                dct[y][x] =( f2 + dct[y][x]  * i_qmf) >> ( 1 + i_qbits );
+            }
+            else
+            {
+                dct[y][x] = - ( ( f2 - dct[y][x]  * i_qmf ) >> (1 + i_qbits ) );
+            }
+        }
+    }
+}
+static void quant_2x2_dc( int16_t dct[2][2], int i_qscale, int b_intra )
+{
+    int const i_qbits = 15 + i_qscale / 6;
+    const int f2 = ( 2 << i_qbits ) / ( b_intra ? 3 : 6 );
+    const int i_qmf = quant_mf[i_qscale%6][0][0];
+
+    int x,y;
+    for( y = 0; y < 2; y++ )
+    {
+        for( x = 0; x < 2; x++ )
+        {
+            if( dct[y][x] > 0 )
+            {
+                dct[y][x] =( f2 + dct[y][x]  * i_qmf) >> ( 1 + i_qbits );
+            }
+            else
+            {
+                dct[y][x] = - ( ( f2 - dct[y][x]  * i_qmf ) >> (1 + i_qbits ) );
+            }
+        }
+    }
+}
+#endif
+
+static void quant_4x4( int16_t dct[4][4], int i_qscale, int b_intra )
+{
+    /* Quantise a 4x4 block in place.  The rounding offset is a per-position
+     * fraction num/den of ( 1 << shift ), taken from the intra or inter table. */
+    const int i_shift = 15 + i_qscale / 6;
+    const int (*p_dz)[4][2] = b_intra ? f_deadzone_intra : f_deadzone_inter;
+    const int (*p_mf)[4] = quant_mf[i_qscale % 6];
+    int i;
+
+    for( i = 0; i < 16; i++ )
+    {
+        const int y = i >> 2;
+        const int x = i & 3;
+        const int i_round = p_dz[y][x][0] * ( 1 << i_shift ) / p_dz[y][x][1];
+        const int i_coef = dct[y][x];
+
+        if( i_coef > 0 )
+        {
+            dct[y][x] = ( i_round + i_coef * p_mf[y][x] ) >> i_shift;
+        }
+        else
+        {
+            dct[y][x] = - ( ( i_round - i_coef * p_mf[y][x] ) >> i_shift );
+        }
+    }
+}
+
+static void quant_4x4_dc( int16_t dct[4][4], int i_qscale )
+{
+    /* In-place quantisation of a 4x4 dc block: every coefficient uses the
+     * [0][0] multiplier and the [0][0] intra rounding fraction. */
+    const int i_shift = 16 + i_qscale / 6;
+    const int i_mult = quant_mf[i_qscale%6][0][0];
+    const int i_round = f_deadzone_intra[0][0][0] * ( 1 << i_shift ) / f_deadzone_intra[0][0][1];
+    int i;
+
+    for( i = 0; i < 16; i++ )
+    {
+        int16_t *p_coef = &dct[i >> 2][i & 3];
+        const int i_scaled = *p_coef * i_mult;
+        if( *p_coef > 0 )
+        {
+            *p_coef = ( i_round + i_scaled ) >> i_shift;
+        }
+        else
+        {
+            *p_coef = - ( ( i_round - i_scaled ) >> i_shift );
+        }
+    }
+}
+
+static void quant_2x2_dc( int16_t dct[2][2], int i_qscale, int b_intra )
+{
+    /* In-place quantisation of a 2x2 dc block with the [0][0] multiplier;
+     * the rounding fraction is the [0][0] intra or inter entry per b_intra. */
+    const int i_shift = 16 + i_qscale / 6;
+    const int i_mult = quant_mf[i_qscale%6][0][0];
+    const int *p_dz = b_intra ? f_deadzone_intra[0][0] : f_deadzone_inter[0][0];
+    const int i_round = p_dz[0] * ( 1 << i_shift ) / p_dz[1];
+    int i;
+
+    for( i = 0; i < 4; i++ )
+    {
+        int16_t *p_coef = &dct[i >> 1][i & 1];
+        const int i_scaled = *p_coef * i_mult;
+        if( *p_coef > 0 )
+        {
+            *p_coef = ( i_round + i_scaled ) >> i_shift;
+        }
+        else
+        {
+            *p_coef = - ( ( i_round - i_scaled ) >> i_shift );
+        }
+    }
+}
+
+static inline int array_non_zero_count( int *v, int i_count )
+{
+    /* number of entries among v[0..i_count-1] that are not zero */
+    int i_left = i_count;
+    int i_found = 0;
+    while( i_left-- > 0 )
+    {
+        if( *v++ != 0 )
+        {
+            i_found++;
+        }
+    }
+    return i_found;
+}
+
+void x264_mb_partition_mvd( x264_macroblock_t *mb, int i_list, int i_part, int i_sub, int mvd[2])
+{
+    /* mvd = mv stored at the partition origin minus the predicted mv; it is
+     * returned through mvd[] and copied into every entry the partition covers. */
+    int i_x0, i_y0, i_width, i_height;
+    int i_col, i_row;
+    int mv_pred[2];
+
+    x264_mb_partition_getxy( mb, i_part, i_sub, &i_x0, &i_y0 );
+    x264_mb_partition_size ( mb, i_part, i_sub, &i_width, &i_height );
+    x264_mb_predict_mv(  mb, i_list, i_part, i_sub, mv_pred );
+
+    mvd[0] = mb->partition[i_x0][i_y0].mv[i_list][0] - mv_pred[0];
+    mvd[1] = mb->partition[i_x0][i_y0].mv[i_list][1] - mv_pred[1];
+
+    for( i_col = i_x0; i_col < i_x0 + i_width; i_col++ )
+    {
+        for( i_row = i_y0; i_row < i_y0 + i_height; i_row++ )
+        {
+            mb->partition[i_col][i_row].mvd[i_list][0] = mvd[0];
+            mb->partition[i_col][i_row].mvd[i_list][1] = mvd[1];
+        }
+    }
+}
+
+/* (ref: JVT-B118)
+ * x264_mb_decimate_score: rate a block of quantised coefficients; a low score
+ * means the block is cheap enough to be replaced by all zeros.
+ * Used for inter macroblocks (luma and chroma):
+ *  luma: for a 8x8 block: if score < 4 -> null
+ *        for the complete mb: if score < 6 -> null
+ *  chroma: for the complete mb: if score < 7 -> null
+ */
+static int x264_mb_decimate_score( int *dct, int i_max )
+{
+    /* cost of a +/-1 coefficient, indexed by the number of zeros just below it */
+    static const int i_run_cost[16] = { 3, 2, 2, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+    int i_pos;
+    int i_total = 0;
+
+    /* skip the zeros at the high end of the array */
+    for( i_pos = i_max - 1; i_pos >= 0; i_pos-- )
+    {
+        if( dct[i_pos] != 0 )
+        {
+            break;
+        }
+    }
+
+    /* walk down, one non-zero coefficient per iteration */
+    while( i_pos >= 0 )
+    {
+        int i_zeros = 0;
+        if( abs( dct[i_pos] ) > 1 )
+        {
+            return 9;   /* larger than every threshold quoted above */
+        }
+        for( i_pos--; i_pos >= 0 && dct[i_pos] == 0; i_pos-- )
+        {
+            i_zeros++;
+        }
+        i_total += i_run_cost[i_zeros];
+    }
+    return i_total;
+}
+
+void x264_mb_encode_i4x4( x264_t *h, x264_macroblock_t *mb, int idx, int i_qscale )
+{
+    /* Code luma block idx of an intra 4x4 macroblock: diff of source and fdec
+     * -> dct -> quant -> zigzag into mb->block[idx].luma4x4, then dequant ->
+     * idct and add the result back onto fdec. */
+    x264_mb_context_t *ctx = mb->context;
+
+    const int i_stride_img = ctx->i_img[0];
+    const int i_stride_dec = ctx->i_fdec[0];
+    const int i_off_x = 4 * block_idx_x[idx];
+    const int i_off_y = 4 * block_idx_y[idx];
+    uint8_t *p_img = ctx->p_img[0] + i_off_x + i_off_y * i_stride_img;
+    uint8_t *p_dec = ctx->p_fdec[0] + i_off_x + i_off_y * i_stride_dec;
+
+    int16_t residual[4][4];
+    int16_t coef[4][4];
+
+    /* forward path */
+    h->pixf.sub4x4( residual, p_img, i_stride_img, p_dec, i_stride_dec );
+    h->dctf.dct4x4( coef, residual );
+    quant_4x4( coef, i_qscale, 1 );
+    scan_zigzag_4x4full( mb->block[idx].luma4x4, coef );
+
+    /* reconstruction path */
+    x264_mb_dequant_4x4( coef, i_qscale );
+    h->dctf.idct4x4( residual, coef );
+    h->pixf.add4x4( p_dec, i_stride_dec, residual );
+}
+
+static void x264_mb_encode_i16x16( x264_t *h, x264_macroblock_t *mb, int i_qscale )
+{
+    x264_mb_context_t *ctx = mb->context;
+    /* Intra 16x16 luma: sixteen 4x4 transforms whose dc terms go through a second (dc) transform. */
+    uint8_t *p_src = ctx->p_img[0];
+    int      i_src = ctx->i_img[0];
+    uint8_t *p_dst = ctx->p_fdec[0];
+    int      i_dst = ctx->i_fdec[0];
+
+    int16_t luma[16][4][4];
+    int16_t dct4x4[16+1][4][4];     /* [0] collects the 16 dc coeffs, [1+i] holds block i */
+
+    int i;
+
+    /* calculate the diff */
+    h->pixf.sub16x16( luma, p_src, i_src, p_dst, i_dst );
+
+    /* calculate dct coeffs */
+    for( i = 0; i < 16; i++ )
+    {
+        h->dctf.dct4x4( dct4x4[i+1], luma[i] );
+
+        /* copy dc coeff (taken before quant_4x4 below modifies the block) */
+        dct4x4[0][block_idx_y[i]][block_idx_x[i]] = dct4x4[1+i][0][0];
+
+        quant_4x4( dct4x4[1+i], i_qscale, 1 );
+        scan_zigzag_4x4( mb->block[i].residual_ac, dct4x4[1+i] );
+    }
+    /* transform, quantise and store the dc block on its own */
+    h->dctf.dct4x4dc( dct4x4[0], dct4x4[0] );
+    quant_4x4_dc( dct4x4[0], i_qscale );
+    scan_zigzag_4x4full( mb->luma16x16_dc, dct4x4[0] );
+
+    /* output samples to fdec */
+    h->dctf.idct4x4dc( dct4x4[0], dct4x4[0] );
+    x264_mb_dequant_4x4_dc( dct4x4[0], i_qscale );  /* XXX not inversed */
+
+    /* dequantise each block, put back its reconstructed dc and inverse transform */
+    for( i = 0; i < 16; i++ )
+    {
+        x264_mb_dequant_4x4( dct4x4[1+i], i_qscale );
+
+        /* copy dc coeff */
+        dct4x4[1+i][0][0] = dct4x4[0][block_idx_y[i]][block_idx_x[i]];
+
+        h->dctf.idct4x4( luma[i], dct4x4[i+1] );
+    }
+    /* put pixels to fdec */
+    h->pixf.add16x16( p_dst, i_dst, luma );
+}
+
+static void x264_mb_encode_8x8( x264_t *h, x264_macroblock_t *mb, int b_inter, int i_qscale )
+{
+    x264_mb_context_t *ctx = mb->context;
+    /* Code both chroma planes (index 1+ch): diff -> dct -> quant with a separate 2x2 dc, then rebuild fdec. */
+    uint8_t *p_src, *p_dst;
+    int      i_src, i_dst;
+
+    int i, ch;
+
+    for( ch = 0; ch < 2; ch++ )
+    {
+        int16_t chroma[4][4][4];
+        int16_t dct2x2[2][2];
+        int16_t dct4x4[4][4][4];
+        int i_decimate_score = 0;   /* per plane: the decimation below only clears this plane */
+
+        p_src = ctx->p_img[1+ch];
+        i_src = ctx->i_img[1+ch];
+        p_dst = ctx->p_fdec[1+ch];
+        i_dst = ctx->i_fdec[1+ch];
+
+        /* calculate the diff */
+        h->pixf.sub8x8( chroma, p_src, i_src, p_dst, i_dst );
+
+        /* calculate dct coeffs */
+        for( i = 0; i < 4; i++ )
+        {
+            h->dctf.dct4x4( dct4x4[i], chroma[i] );
+
+            /* copy dc coeff */
+            dct2x2[block_idx_y[i]][block_idx_x[i]] = dct4x4[i][0][0];
+
+            quant_4x4( dct4x4[i], i_qscale, b_inter ? 0 : 1 );
+            scan_zigzag_4x4( mb->block[16+i+ch*4].residual_ac, dct4x4[i] );
+
+            i_decimate_score += x264_mb_decimate_score( mb->block[16+i+ch*4].residual_ac, 15 );
+        }
+
+        h->dctf.dct2x2dc( dct2x2, dct2x2 );
+        quant_2x2_dc( dct2x2, i_qscale, b_inter ? 0 : 1  );
+        scan_zigzag_2x2_dc( mb->chroma_dc[ch], dct2x2 );
+
+        if( i_decimate_score < 7 && b_inter )
+        {
+            /* Near null chroma 8x8 block so make it null (bits saving) */
+            for( i = 0; i < 4; i++ )
+            {
+                int x, y;
+                for( x = 0; x < 15; x++ )
+                {
+                    mb->block[16+i+ch*4].residual_ac[x] = 0;
+                }
+                for( x = 0; x < 4; x++ )
+                {
+                    for( y = 0; y < 4; y++ )
+                    {
+                        dct4x4[i][x][y] = 0;
+                    }
+                }
+            }
+        }
+
+        /* output samples to fdec */
+        h->dctf.idct2x2dc( dct2x2, dct2x2 );
+        x264_mb_dequant_2x2_dc( dct2x2, i_qscale );  /* XXX not inversed */
+
+        /* dequantise each block, put back its reconstructed dc and inverse transform */
+        for( i = 0; i < 4; i++ )
+        {
+            x264_mb_dequant_4x4( dct4x4[i], i_qscale );
+
+            /* copy dc coeff */
+            dct4x4[i][0][0] = dct2x2[block_idx_y[i]][block_idx_x[i]];
+
+            h->dctf.idct4x4( chroma[i], dct4x4[i] );
+        }
+        h->pixf.add8x8( p_dst, i_dst, chroma );
+    }
+}
+
+/*****************************************************************************
+ * x264_macroblock_encode: transform/quantise one macroblock, rebuild it in fdec, set cbp and P_SKIP
+ *****************************************************************************/
+void x264_macroblock_encode( x264_t *h, x264_macroblock_t *mb )
+{
+    x264_mb_context_t *ctx = mb->context;
+    int i;
+
+    int     i_qscale;
+
+    /* quantification scale */
+    i_qscale = mb->i_qp;
+
+    if( mb->i_type == I_16x16 )
+    {
+        /* do the right prediction */
+        h->predict_16x16[mb->i_intra16x16_pred_mode]( ctx->p_fdec[0], ctx->i_fdec[0] );
+
+        /* encode the 16x16 macroblock */
+        x264_mb_encode_i16x16( h, mb, i_qscale );
+
+        /* fix the pred mode value */
+        mb->i_intra16x16_pred_mode = x264_mb_pred_mode16x16_fix[mb->i_intra16x16_pred_mode];
+    }
+    else if( mb->i_type == I_4x4 )
+    {
+        for( i = 0; i < 16; i++ )
+        {
+            uint8_t *p_dst_by;
+
+            /* Do the right prediction */
+            p_dst_by = ctx->p_fdec[0] + 4 * block_idx_x[i] + 4 * block_idx_y[i] * ctx->i_fdec[0];
+            h->predict_4x4[mb->block[i].i_intra4x4_pred_mode]( p_dst_by, ctx->i_fdec[0] );
+
+            /* encode one 4x4 block */
+            x264_mb_encode_i4x4( h, mb, i, i_qscale );
+
+            /* fix the pred mode value */
+            mb->block[i].i_intra4x4_pred_mode = x264_mb_pred_mode4x4_fix[mb->block[i].i_intra4x4_pred_mode];
+        }
+    }
+    else    /* Inter MB */
+    {
+        int16_t dct4x4[16][4][4];
+
+        int i8x8, i4x4, idx;
+        int i_decimate_mb = 0;
+
+        /* Motion compensation */
+        x264_mb_mc( h, mb );
+
+        for( i8x8 = 0; i8x8 < 4; i8x8++ )
+        {
+            int16_t luma[4][4];
+            int i_decimate_8x8;
+
+            /* encode one 4x4 block */
+            i_decimate_8x8 = 0;
+            for( i4x4 = 0; i4x4 < 4; i4x4++ )
+            {
+                uint8_t *p_src, *p_dst;
+
+                idx = i8x8 * 4 + i4x4;
+
+                p_src = ctx->p_img[0] + 4 * block_idx_x[idx] + 4 * block_idx_y[idx] * ctx->i_img[0];
+                p_dst = ctx->p_fdec[0] + 4 * block_idx_x[idx] + 4 * block_idx_y[idx] * ctx->i_fdec[0];
+
+                /* we calculate diff */
+                h->pixf.sub4x4( luma, p_src, ctx->i_img[0],p_dst, ctx->i_fdec[0] );
+
+                /* calculate dct coeffs */
+                h->dctf.dct4x4( dct4x4[idx], luma );
+                quant_4x4( dct4x4[idx], i_qscale, 0 );
+
+                scan_zigzag_4x4full( mb->block[idx].luma4x4, dct4x4[idx] );
+                i_decimate_8x8 += x264_mb_decimate_score( mb->block[idx].luma4x4, 16 );
+            }
+
+            /* decimate this 8x8 block */
+            i_decimate_mb += i_decimate_8x8;
+            if( i_decimate_8x8 < 4 )
+            {
+                for( i4x4 = 0; i4x4 < 4; i4x4++ )
+                {
+                    int x, y;
+                    idx = i8x8 * 4 + i4x4;
+                    for( i = 0; i < 16; i++ )
+                    {
+                        mb->block[idx].luma4x4[i] = 0;
+                    }
+                    for( x = 0; x < 4; x++ )
+                    {
+                        for( y = 0; y < 4; y++ )
+                        {
+                            dct4x4[idx][x][y] = 0;
+                        }
+                    }
+                }
+            }
+        }
+
+        if( i_decimate_mb < 6 )
+        {
+            for( i8x8 = 0; i8x8 < 4; i8x8++ )
+            {
+                for( i4x4 = 0; i4x4 < 4; i4x4++ )
+                {
+                    for( i = 0; i < 16; i++ )
+                    {
+                        mb->block[i8x8 * 4 + i4x4].luma4x4[i] = 0;
+                    }
+                }
+            }
+        }
+        else
+        {
+            for( i8x8 = 0; i8x8 < 4; i8x8++ )
+            {
+                int16_t luma[4][4];
+                /* TODO we could avoid it if we had decimate this 8x8 block */
+                /* output samples to fdec */
+                for( i4x4 = 0; i4x4 < 4; i4x4++ )
+                {
+                    uint8_t *p_dst;
+
+                    idx = i8x8 * 4 + i4x4;
+
+                    x264_mb_dequant_4x4( dct4x4[idx], i_qscale );
+                    h->dctf.idct4x4( luma, dct4x4[idx] );
+
+                    /* put pixel to fdec */
+                    p_dst = ctx->p_fdec[0] + 4 * block_idx_x[idx] + 4 * block_idx_y[idx] * ctx->i_fdec[0];
+                    h->pixf.add4x4( p_dst, ctx->i_fdec[0], luma );
+                }
+            }
+        }
+    }
+
+    /* encode chroma */
+    i_qscale = i_chroma_qp_table[x264_clip3( i_qscale + h->pps->i_chroma_qp_index_offset, 0, 51 )];
+    if( IS_INTRA( mb->i_type ) )
+    {
+        /* do the right prediction */
+        h->predict_8x8[mb->i_chroma_pred_mode]( ctx->p_fdec[1], ctx->i_fdec[1] );
+        h->predict_8x8[mb->i_chroma_pred_mode]( ctx->p_fdec[2], ctx->i_fdec[2] );
+    }
+
+    /* encode the 8x8 blocks */
+    x264_mb_encode_8x8( h, mb, !IS_INTRA( mb->i_type ), i_qscale );
+
+    /* fix the pred mode value */
+    if( IS_INTRA( mb->i_type ) )
+    {
+        mb->i_chroma_pred_mode = x264_mb_pred_mode8x8_fix[mb->i_chroma_pred_mode];
+    }
+
+    /* Calculate the Luma/Chroma pattern and non_zero_count */
+    if( mb->i_type == I_16x16 )
+    {
+        mb->i_cbp_luma = 0x00;
+        for( i = 0; i < 16; i++ )
+        {
+            mb->block[i].i_non_zero_count = array_non_zero_count( mb->block[i].residual_ac, 15 );
+            if( mb->block[i].i_non_zero_count > 0 )
+            {
+                mb->i_cbp_luma = 0x0f;
+            }
+        }
+    }
+    else
+    {
+        mb->i_cbp_luma = 0x00;
+        for( i = 0; i < 16; i++ )
+        {
+            mb->block[i].i_non_zero_count = array_non_zero_count( mb->block[i].luma4x4, 16 );
+            if( mb->block[i].i_non_zero_count > 0 )
+            {
+                mb->i_cbp_luma |= 1 << (i/4);
+            }
+        }
+    }
+
+    /* Calculate the chroma pattern */
+    mb->i_cbp_chroma = 0x00;
+    for( i = 0; i < 8; i++ )
+    {
+        mb->block[16+i].i_non_zero_count = array_non_zero_count( mb->block[16+i].residual_ac, 15 );
+        if( mb->block[16+i].i_non_zero_count > 0 )
+        {
+            mb->i_cbp_chroma = 0x02;    /* dc+ac (we can't do only ac) */
+        }
+    }
+    if( mb->i_cbp_chroma == 0x00 &&
+        ( array_non_zero_count( mb->chroma_dc[0], 4 ) > 0 || array_non_zero_count( mb->chroma_dc[1], 4 ) > 0 ) )
+    {
+        mb->i_cbp_chroma = 0x01;    /* dc only */
+    }
+
+    /* Check for P_SKIP
+     * XXX: in the me perhaps we should take x264_mb_predict_mv_pskip into account
+     *      (if multiple mv give same result)*/
+    if( mb->i_type == P_L0 && mb->i_partition == D_16x16 &&
+        mb->i_cbp_luma == 0x00 && mb->i_cbp_chroma == 0x00 )
+    {
+
+        int i_ref;
+        int mvx, mvy;
+        x264_mb_partition_get( mb, 0, 0, 0, &i_ref, &mvx, &mvy );
+
+        if( i_ref == 0 )
+        {
+            int mvp[2];
+
+            x264_mb_predict_mv_pskip( mb, mvp );
+            if( mvp[0] == mvx && mvp[1] == mvy )
+            {
+                mb->i_type = P_SKIP;
+            }
+        }
+    }
+}
+
+
+#define BLOCK_INDEX_CHROMA_DC   (-1)
+#define BLOCK_INDEX_LUMA_DC     (-2)
+
+/* Write the code word v to the bitstream s: v.i_size is handed to bs_write()
+ * as the bit count and v.i_bits as the value. */
+static inline void bs_write_vlc( bs_t *s, vlc_t v )
+{
+    bs_write( s, v.i_size, v.i_bits );
+}
+
+/****************************************************************************
+ * block_residual_write_cavlc:
+ *  Write one block of residual coefficients to the bitstream s.
+ *   i_idx   : block index handed to x264_mb_predict_non_zero_code(), or one
+ *             of BLOCK_INDEX_CHROMA_DC / BLOCK_INDEX_LUMA_DC
+ *   l       : the i_count coefficients of the block
+ *   i_count : number of entries in l (callers in this file pass 4, 15 or 16)
+ *  Order of what is written: coeff token, trailing-one signs, remaining
+ *  levels, total zeros, then the run before each coefficient.
+ ****************************************************************************/
+static void block_residual_write_cavlc( x264_t *h, bs_t *s, x264_macroblock_t *mb, int i_idx, int *l, int i_count )
+{
+    int level[16], run[16];
+    int i_total, i_trailing;
+    int i_total_zero;
+    int i_last;
+    unsigned int i_sign;
+
+    int i;
+    int i_zero_left;
+    int i_suffix_length;
+
+    /* first find i_last */
+    i_last = i_count - 1;
+    while( i_last >= 0 && l[i_last] == 0 )
+    {
+        i_last--;
+    }
+
+    i_sign = 0;
+    i_total = 0;
+    i_trailing = 0;
+    i_total_zero = 0;
+
+    if( i_last >= 0 )
+    {
+        int b_trailing = 1;
+        int idx = 0;
+
+        /* level and run and total */
+        /* l is walked from its last non-zero entry down to index 0, so
+         * level[0] is the last non-zero coefficient of l and run[idx] is the
+         * number of zeros found directly below level[idx] in l. */
+        while( i_last >= 0 )
+        {
+            level[idx] = l[i_last--];
+
+            run[idx] = 0;
+            while( i_last >= 0 && l[i_last] == 0 )
+            {
+                run[idx]++;
+                i_last--;
+            }
+
+            i_total++;
+            i_total_zero += run[idx];
+
+            /* Count at most 3 leading +/-1 levels; their signs are shifted
+             * into i_sign (1 = negative), first one ending up in the MSB.
+             * The first level that is not +/-1 stops the counting. */
+            if( b_trailing && abs( level[idx] ) == 1 && i_trailing < 3 )
+            {
+                i_sign <<= 1;
+                if( level[idx] < 0 )
+                {
+                    i_sign |= 0x01;
+                }
+
+                i_trailing++;
+            }
+            else
+            {
+                b_trailing = 0;
+            }
+
+            idx++;
+        }
+    }
+
+    /* total/trailing */
+    /* Chroma DC always uses table 4 of x264_coeff_token; every other block
+     * picks its table from the predicted non zero count nC. */
+    if( i_idx == BLOCK_INDEX_CHROMA_DC )
+    {
+        bs_write_vlc( s, x264_coeff_token[4][i_total*4+i_trailing] );
+    }
+    else
+    {
+        /* x264_mb_predict_non_zero_code return 0 <-> (16+16+1)>>1 = 16 */
+        static const int ct_index[17] = {0,0,1,1,2,2,2,2,3,3,3,3,3,3,3,3,3 };
+        int nC;
+
+        /* the luma DC block is predicted as block 0 */
+        if( i_idx == BLOCK_INDEX_LUMA_DC )
+        {
+            nC = x264_mb_predict_non_zero_code( h, mb, 0 );
+        }
+        else
+        {
+            nC = x264_mb_predict_non_zero_code( h, mb, i_idx );
+        }
+
+        bs_write_vlc( s, x264_coeff_token[ct_index[nC]][i_total*4+i_trailing] );
+    }
+
+    /* nothing more to write for an all-zero block */
+    if( i_total <= 0 )
+    {
+        return;
+    }
+
+    i_suffix_length = i_total > 10 && i_trailing < 3 ? 1 : 0;
+    /* signs of the trailing ones, one bit each */
+    if( i_trailing > 0 )
+    {
+        bs_write( s, i_trailing, i_sign );
+    }
+    /* remaining levels, each as a prefix code plus an optional suffix */
+    for( i = i_trailing; i < i_total; i++ )
+    {
+        int i_level_code;
+
+        /* calculate level code */
+        if( level[i] < 0 )
+        {
+            i_level_code = -2*level[i] - 1;
+        }
+        else /* if( level[i] > 0 ) */
+        {
+            i_level_code = 2 * level[i] - 2;
+        }
+        if( i == i_trailing && i_trailing < 3 )
+        {
+            i_level_code -=2; /* as level[i] can't be 1 for the first one if i_trailing < 3 */
+        }
+
+        /* Four cases: prefix below 14, the two prefix-14 escapes (with and
+         * without a suffix length), and the prefix-15 escape carrying a
+         * 12 bit suffix. */
+        if( ( i_level_code >> i_suffix_length ) < 14 )
+        {
+            bs_write_vlc( s, x264_level_prefix[i_level_code >> i_suffix_length] );
+            if( i_suffix_length > 0 )
+            {
+                bs_write( s, i_suffix_length, i_level_code );
+            }
+        }
+        else if( i_suffix_length == 0 && i_level_code < 30 )
+        {
+            bs_write_vlc( s, x264_level_prefix[14] );
+            bs_write( s, 4, i_level_code - 14 );
+        }
+        else if( i_suffix_length > 0 && ( i_level_code >> i_suffix_length ) == 14 )
+        {
+            bs_write_vlc( s, x264_level_prefix[14] );
+            bs_write( s, i_suffix_length, i_level_code );
+        }
+        else
+        {
+            bs_write_vlc( s, x264_level_prefix[15] );
+            i_level_code -= 15 << i_suffix_length;
+            if( i_suffix_length == 0 )
+            {
+                i_level_code -= 15;
+            }
+
+            /* only reported: the value is still written below */
+            if( i_level_code >= ( 1 << 12 ) || i_level_code < 0 )
+            {
+                fprintf( stderr, "OVERFLOW levelcode=%d\n", i_level_code );
+            }
+
+            bs_write( s, 12, i_level_code );    /* check overflow ?? */
+        }
+
+        /* adapt the suffix length for the next level (never above 6) */
+        if( i_suffix_length == 0 )
+        {
+            i_suffix_length++;
+        }
+        if( abs( level[i] ) > ( 3 << ( i_suffix_length - 1 ) ) && i_suffix_length < 6 )
+        {
+            i_suffix_length++;
+        }
+    }
+
+    /* total zeros is only written when the block is not completely filled */
+    if( i_total < i_count )
+    {
+        if( i_idx == BLOCK_INDEX_CHROMA_DC )
+        {
+            bs_write_vlc( s, x264_total_zeros_dc[i_total-1][i_total_zero] );
+        }
+        else
+        {
+            bs_write_vlc( s, x264_total_zeros[i_total-1][i_total_zero] );
+        }
+    }
+
+    /* run before each coefficient except the last one handled, stopping as
+     * soon as no zero is left to account for */
+    for( i = 0, i_zero_left = i_total_zero; i < i_total - 1; i++ )
+    {
+        int i_zl;
+
+        if( i_zero_left <= 0 )
+        {
+            break;
+        }
+
+        /* table index is capped at 6 */
+        i_zl = X264_MIN( i_zero_left - 1, 6 );
+
+        bs_write_vlc( s, x264_run_before[i_zl][run[i]] );
+
+        i_zero_left -= run[i];
+    }
+}
+
+/*****************************************************************************
+ * x264_macroblock_write_cavlc:
+ *  Write one macroblock to the bitstream s using CAVLC:
+ *   - the macroblock type (offset by the slice type for intra types),
+ *   - the intra prediction modes or the reference indexes and mv deltas,
+ *   - the coded block pattern (not for I_16x16, where it is part of the type),
+ *   - the qp delta and the residual blocks.
+ *  I_PCM macroblocks are written as raw samples and return early.
+ *  On an unsupported slice type or on B_8x8 an error is printed on stderr
+ *  and the function returns (for B_8x8 nothing has been written yet).
+ *****************************************************************************/
+void x264_macroblock_write_cavlc( x264_t *h, bs_t *s, x264_macroblock_t *mb )
+{
+    int i_mb_i_offset;
+    int i;
+
+    /* intra mb types are numbered after the inter types of the slice */
+    switch( h->sh.i_type )
+    {
+        case SLICE_TYPE_I:
+            i_mb_i_offset = 0;
+            break;
+        case SLICE_TYPE_P:
+            i_mb_i_offset = 5;
+            break;
+        case SLICE_TYPE_B:
+            i_mb_i_offset = 23;
+            break;
+        default:
+            fprintf( stderr, "internal error or slice unsupported\n" );
+            return;
+    }
+
+    /* Write:
+      - type
+      - prediction
+      - mv */
+    if( mb->i_type == I_PCM )
+    {
+        /* Untested */
+        bs_write_ue( s, i_mb_i_offset + 25 );
+
+        bs_align_0( s );
+        /* The samples of the macroblock are written in raster order: every
+         * row of the block starts one picture stride after the previous one,
+         * so the offset must be split into a row and a column part. */
+        /* Luma */
+        for( i = 0; i < 16*16; i++ )
+        {
+            const int i_x = mb->i_mb_x * 16 + ( i % 16 );
+            const int i_y = mb->i_mb_y * 16 + ( i / 16 );
+
+            bs_write( s, 8, h->picture->plane[0][i_y * h->picture->i_stride[0] + i_x] );
+        }
+        /* Cb */
+        for( i = 0; i < 8*8; i++ )
+        {
+            const int i_x = mb->i_mb_x * 8 + ( i % 8 );
+            const int i_y = mb->i_mb_y * 8 + ( i / 8 );
+
+            bs_write( s, 8, h->picture->plane[1][i_y * h->picture->i_stride[1] + i_x] );
+        }
+        /* Cr */
+        for( i = 0; i < 8*8; i++ )
+        {
+            const int i_x = mb->i_mb_x * 8 + ( i % 8 );
+            const int i_y = mb->i_mb_y * 8 + ( i / 8 );
+
+            bs_write( s, 8, h->picture->plane[2][i_y * h->picture->i_stride[2] + i_x] );
+        }
+
+        for( i = 0; i < 16 + 8; i++ )
+        {
+            /* special case */
+            mb->block[i].i_non_zero_count = 16;
+        }
+        return;
+    }
+    else if( mb->i_type == I_4x4 )
+    {
+        bs_write_ue( s, i_mb_i_offset + 0 );
+
+        /* Prediction: Luma */
+        for( i = 0; i < 16; i++ )
+        {
+            int i_predicted_mode = x264_mb_predict_intra4x4_mode( h, mb, i );
+            int i_mode = mb->block[i].i_intra4x4_pred_mode;
+
+            if( i_predicted_mode == i_mode)
+            {
+                bs_write1( s, 1 );  /* b_prev_intra4x4_pred_mode */
+            }
+            else
+            {
+                bs_write1( s, 0 );  /* b_prev_intra4x4_pred_mode */
+                /* the predicted mode is skipped in the 3 bits numbering */
+                if( i_mode < i_predicted_mode )
+                {
+                    bs_write( s, 3, i_mode );
+                }
+                else
+                {
+                    bs_write( s, 3, i_mode - 1 );
+                }
+            }
+        }
+        /* Prediction: chroma */
+        bs_write_ue( s, mb->i_chroma_pred_mode );
+    }
+    else if( mb->i_type == I_16x16 )
+    {
+        /* pred mode and both cbp are folded into the mb type */
+        bs_write_ue( s, i_mb_i_offset + 1 + mb->i_intra16x16_pred_mode +
+                                            mb->i_cbp_chroma * 4 +
+                                            ( mb->i_cbp_luma == 0 ? 0 : 12 ) );
+        /* Prediction: chroma */
+        bs_write_ue( s, mb->i_chroma_pred_mode );
+    }
+    else if( mb->i_type == P_L0 )
+    {
+        int mvp[2];
+
+        if( mb->i_partition == D_16x16 )
+        {
+            bs_write_ue( s, 0 );
+
+            if( h->sh.i_num_ref_idx_l0_active > 1 )
+            {
+                bs_write_te( s, h->sh.i_num_ref_idx_l0_active - 1, mb->partition[0][0].i_ref[0] );
+            }
+            x264_mb_predict_mv( mb, 0, 0, 0, mvp );
+            bs_write_se( s, mb->partition[0][0].mv[0][0] - mvp[0] );
+            bs_write_se( s, mb->partition[0][0].mv[0][1] - mvp[1] );
+        }
+        else if( mb->i_partition == D_16x8 )
+        {
+            bs_write_ue( s, 1 );
+            if( h->sh.i_num_ref_idx_l0_active > 1 )
+            {
+                bs_write_te( s, h->sh.i_num_ref_idx_l0_active - 1, mb->partition[0][0].i_ref[0] );
+                bs_write_te( s, h->sh.i_num_ref_idx_l0_active - 1, mb->partition[0][2].i_ref[0] );
+            }
+
+            x264_mb_predict_mv( mb, 0, 0, 0, mvp );
+            bs_write_se( s, mb->partition[0][0].mv[0][0] - mvp[0] );
+            bs_write_se( s, mb->partition[0][0].mv[0][1] - mvp[1] );
+
+            x264_mb_predict_mv( mb, 0, 1, 0, mvp );
+            bs_write_se( s, mb->partition[0][2].mv[0][0] - mvp[0] );
+            bs_write_se( s, mb->partition[0][2].mv[0][1] - mvp[1] );
+        }
+        else if( mb->i_partition == D_8x16 )
+        {
+            bs_write_ue( s, 2 );
+            if( h->sh.i_num_ref_idx_l0_active > 1 )
+            {
+                bs_write_te( s, h->sh.i_num_ref_idx_l0_active - 1, mb->partition[0][0].i_ref[0] );
+                bs_write_te( s, h->sh.i_num_ref_idx_l0_active - 1, mb->partition[2][0].i_ref[0] );
+            }
+
+            x264_mb_predict_mv( mb, 0, 0, 0, mvp );
+            bs_write_se( s, mb->partition[0][0].mv[0][0] - mvp[0] );
+            bs_write_se( s, mb->partition[0][0].mv[0][1] - mvp[1] );
+
+            x264_mb_predict_mv( mb, 0, 1, 0, mvp );
+            bs_write_se( s, mb->partition[2][0].mv[0][0] - mvp[0] );
+            bs_write_se( s, mb->partition[2][0].mv[0][1] - mvp[1] );
+        }
+    }
+    else if( mb->i_type == P_8x8 )
+    {
+        int b_sub_ref0;
+
+        /* when the four sub blocks all use ref 0, type 4 is written and no
+         * reference index follows */
+        if( mb->partition[0][0].i_ref[0] == 0 &&
+            mb->partition[0][2].i_ref[0] == 0 &&
+            mb->partition[2][0].i_ref[0] == 0 &&
+            mb->partition[2][2].i_ref[0] == 0 )
+        {
+            bs_write_ue( s, 4 );
+            b_sub_ref0 = 0;
+        }
+        else
+        {
+            bs_write_ue( s, 3 );
+            b_sub_ref0 = 1;
+        }
+        /* sub mb type */
+        for( i = 0; i < 4; i++ )
+        {
+            switch( mb->i_sub_partition[i] )
+            {
+                case D_L0_8x8:
+                    bs_write_ue( s, 0 );
+                    break;
+                case D_L0_8x4:
+                    bs_write_ue( s, 1 );
+                    break;
+                case D_L0_4x8:
+                    bs_write_ue( s, 2 );
+                    break;
+                case D_L0_4x4:
+                    bs_write_ue( s, 3 );
+                    break;
+            }
+        }
+        /* ref0 */
+        if( h->sh.i_num_ref_idx_l0_active > 1 && b_sub_ref0 )
+        {
+            bs_write_te( s, h->sh.i_num_ref_idx_l0_active - 1, mb->partition[0][0].i_ref[0] );
+            bs_write_te( s, h->sh.i_num_ref_idx_l0_active - 1, mb->partition[2][0].i_ref[0] );
+            bs_write_te( s, h->sh.i_num_ref_idx_l0_active - 1, mb->partition[0][2].i_ref[0] );
+            bs_write_te( s, h->sh.i_num_ref_idx_l0_active - 1, mb->partition[2][2].i_ref[0] );
+        }
+        /* mv delta of every partition of every sub block */
+        for( i = 0; i < 4; i++ )
+        {
+            int i_part;
+            for( i_part = 0; i_part < x264_mb_partition_count_table[mb->i_sub_partition[i]]; i_part++ )
+            {
+                int mvx, mvy;
+                int mvp[2];
+
+                x264_mb_partition_get( mb, 0, i, i_part, NULL, &mvx, &mvy );
+                x264_mb_predict_mv( mb, 0, i, i_part, mvp );
+
+                bs_write_se( s, mvx - mvp[0] );
+                bs_write_se( s, mvy - mvp[1]);
+            }
+        }
+    }
+    else if( mb->i_type == B_8x8 )
+    {
+        fprintf( stderr, "invalid/unhandled mb_type (B_8x8)\n" );
+        return;
+    }
+    else if( mb->i_type != B_DIRECT )
+    {
+        /* All B mode */
+        /* Motion Vector */
+        int i_part = x264_mb_partition_count_table[mb->i_partition];
+        int i_ref;
+        int mvx, mvy;
+        int mvp[2];
+
+        /* b_listX[i]: partition i uses reference list X */
+        int b_list0[2];
+        int b_list1[2];
+
+        /* init ref list utilisations */
+        for( i = 0; i < 2; i++ )
+        {
+            b_list0[i] = x264_mb_type_list0_table[mb->i_type][i];
+            b_list1[i] = x264_mb_type_list1_table[mb->i_type][i];
+        }
+
+
+        if( mb->i_partition == D_16x16 )
+        {
+            if( b_list0[0] && b_list1[0] )
+            {
+                bs_write_ue( s, 3 );
+            }
+            else if( b_list1[0] )
+            {
+                bs_write_ue( s, 2 );
+            }
+            else
+            {
+                bs_write_ue( s, 1 );
+            }
+        }
+        else
+        {
+            if( mb->i_type == B_BI_BI )
+            {
+                bs_write_ue( s, 20 + (mb->i_partition == D_16x8 ? 0 : 1 ) );
+            }
+            else if( b_list0[0] && b_list1[0] )
+            {
+                /* B_BI_LX* */
+                bs_write_ue( s, 16 + (b_list0[1]?0:2) + (mb->i_partition == D_16x8?0:1) );
+            }
+            else if( b_list0[1] && b_list1[1] )
+            {
+                /* B_LX_BI: the second partition is bi-predicted, so it is the
+                 * list of the FIRST partition that tells L0_BI from L1_BI
+                 * (b_list0[1] is always set in this branch) */
+                bs_write_ue( s, 12 + (b_list0[0]?0:2) + (mb->i_partition == D_16x8?0:1) );
+            }
+            else if( b_list1[1] )
+            {
+                /* B_LX_L1 */
+                bs_write_ue( s, 6 + (b_list0[0]?2:0) + (mb->i_partition == D_16x8?0:1) );
+            }
+            else if( b_list0[1] )
+            {
+                /* B_LX_L0 */
+                bs_write_ue( s, 4 + (b_list0[0]?0:6) + (mb->i_partition == D_16x8?0:1) );
+            }
+        }
+
+        /* reference indexes: all of list 0, then all of list 1 */
+        if( h->sh.i_num_ref_idx_l0_active > 1 )
+        {
+            for( i = 0; i < i_part; i++ )
+            {
+                if( b_list0[i] )
+                {
+                    x264_mb_partition_get( mb, 0, i, 0, &i_ref, NULL, NULL );
+                    bs_write_te( s, h->sh.i_num_ref_idx_l0_active - 1, i_ref );
+                }
+            }
+        }
+        if( h->sh.i_num_ref_idx_l1_active > 1 )
+        {
+            for( i = 0; i < i_part; i++ )
+            {
+                if( b_list1[i] )
+                {
+                    x264_mb_partition_get( mb, 1, i, 0, &i_ref, NULL, NULL );
+                    bs_write_te( s, h->sh.i_num_ref_idx_l1_active - 1, i_ref );
+                }
+            }
+        }
+
+        /* mv deltas: all of list 0, then all of list 1 */
+        for( i = 0; i < i_part; i++ )
+        {
+            if( b_list0[i] )
+            {
+                x264_mb_partition_get( mb, 0, i, 0, NULL, &mvx, &mvy );
+                x264_mb_predict_mv( mb, 0, i, 0, mvp );
+
+                bs_write_se( s, mvx - mvp[0] );
+                bs_write_se( s, mvy - mvp[1] );
+            }
+        }
+        for( i = 0; i < i_part; i++ )
+        {
+            if( b_list1[i] )
+            {
+                x264_mb_partition_get( mb, 1, i, 0, NULL, &mvx, &mvy );
+                x264_mb_predict_mv( mb, 1, i, 0, mvp );
+
+                bs_write_se( s, mvx - mvp[0] );
+                bs_write_se( s, mvy - mvp[1] );
+            }
+        }
+    }
+    else if( mb->i_type == B_DIRECT )
+    {
+        bs_write_ue( s, 0 );
+    }
+    else
+    {
+        fprintf( stderr, "invalid/unhandled mb_type\n" );
+        return;
+    }
+
+    /* Coded block pattern */
+    if( mb->i_type == I_4x4 )
+    {
+        bs_write_ue( s, intra4x4_cbp_to_golomb[( mb->i_cbp_chroma << 4 )|mb->i_cbp_luma] );
+    }
+    else if( mb->i_type != I_16x16 )
+    {
+        bs_write_ue( s, inter_cbp_to_golomb[( mb->i_cbp_chroma << 4 )|mb->i_cbp_luma] );
+    }
+
+    /* write residual */
+    /* NOTE(review): the qp delta is taken against the previous macroblock for
+     * I_16x16 but against the slice qp for every other type; both give 0 as
+     * long as the qp is the same for the whole slice - confirm before using
+     * a per macroblock qp. */
+    if( mb->i_type == I_16x16 )
+    {
+        if( mb->i_mb_x > 0 || mb->i_mb_y > 0 )
+            bs_write_se( s, mb->i_qp - (mb-1)->i_qp);
+        else
+            bs_write_se( s, mb->i_qp - h->pps->i_pic_init_qp - h->sh.i_qp_delta );
+
+        /* DC Luma */
+        block_residual_write_cavlc( h, s, mb, BLOCK_INDEX_LUMA_DC , mb->luma16x16_dc, 16 );
+
+        if( mb->i_cbp_luma != 0 )
+        {
+            /* AC Luma */
+            for( i = 0; i < 16; i++ )
+            {
+                block_residual_write_cavlc( h, s, mb, i, mb->block[i].residual_ac, 15 );
+            }
+        }
+    }
+    else if( mb->i_cbp_luma != 0 || mb->i_cbp_chroma != 0 )
+    {
+        bs_write_se( s, mb->i_qp - h->pps->i_pic_init_qp - h->sh.i_qp_delta );
+
+        /* only the 4x4 blocks of a coded 8x8 block are written */
+        for( i = 0; i < 16; i++ )
+        {
+            if( mb->i_cbp_luma & ( 1 << ( i / 4 ) ) )
+            {
+                block_residual_write_cavlc( h, s, mb, i, mb->block[i].luma4x4, 16 );
+            }
+        }
+    }
+    if( mb->i_cbp_chroma != 0 )
+    {
+        /* Chroma DC residual present */
+        block_residual_write_cavlc( h, s, mb, BLOCK_INDEX_CHROMA_DC, mb->chroma_dc[0], 4 );
+        block_residual_write_cavlc( h, s, mb, BLOCK_INDEX_CHROMA_DC, mb->chroma_dc[1], 4 );
+        if( mb->i_cbp_chroma&0x02 ) /* Chroma AC residual present */
+        {
+            for( i = 0; i < 8; i++ )
+            {
+                block_residual_write_cavlc( h, s, mb, 16 + i, mb->block[16+i].residual_ac, 15 );
+            }
+        }
+    }
+}
+
+/*****************************************************************************
+ *
+ * Cabac stuff
+ *
+ *****************************************************************************/
+
+/* Encode the macroblock type with cabac for I and P slices.
+ * Any other slice type only prints an error on stderr.
+ * In both slice types every mb type that is not handled explicitly is coded
+ * as Intra 16x16; its bins are shared at the end of the function and only
+ * the context indexes differ between the two slice types. */
+static void x264_cabac_mb_type( x264_t *h, x264_macroblock_t *mb )
+{
+    x264_macroblock_t *left = mb->context->mba;
+    x264_macroblock_t *top  = mb->context->mbb;
+
+    /* contexts of the Intra 16x16 bins */
+    int i_ctx_luma;         /* cbp luma == 0 or not */
+    int i_ctx_chroma;       /* cbp chroma == 0 or not */
+    int i_ctx_chroma_ac;    /* cbp chroma == 1 or not */
+    int i_ctx_pred_hi;      /* pred mode / 2 */
+    int i_ctx_pred_lo;      /* pred mode % 2 */
+
+    if( h->sh.i_type == SLICE_TYPE_I )
+    {
+        /* first bin: one step per neighbour that is present and not I_4x4 */
+        int i_ctx = 3;
+
+        if( left != NULL && left->i_type != I_4x4 )
+        {
+            i_ctx++;
+        }
+        if( top != NULL && top->i_type != I_4x4 )
+        {
+            i_ctx++;
+        }
+
+        if( mb->i_type == I_4x4 )
+        {
+            x264_cabac_encode_decision( &h->cabac, i_ctx, 0 );
+            return;
+        }
+
+        x264_cabac_encode_decision( &h->cabac, i_ctx, 1 );
+        if( mb->i_type == I_PCM )
+        {
+            x264_cabac_encode_terminal( &h->cabac, 1 ); /*ctxIdx == 276 */
+            return;
+        }
+        x264_cabac_encode_terminal( &h->cabac, 0 ); /*ctxIdx == 276 */
+
+        /* I_16x16 */
+        i_ctx_luma      = 3 + 3;
+        i_ctx_chroma    = 3 + 4;
+        i_ctx_chroma_ac = 3 + 5;
+        i_ctx_pred_hi   = 3 + 6;
+        i_ctx_pred_lo   = 3 + 7;
+    }
+    else if( h->sh.i_type == SLICE_TYPE_P )
+    {
+        /* prefix: 14, suffix: 17 */
+        if( mb->i_type == P_L0 )
+        {
+            if( mb->i_partition == D_16x16 )
+            {
+                x264_cabac_encode_decision( &h->cabac, 14, 0 );
+                x264_cabac_encode_decision( &h->cabac, 15, 0 );
+                x264_cabac_encode_decision( &h->cabac, 16, 0 );
+            }
+            else if( mb->i_partition == D_16x8 || mb->i_partition == D_8x16 )
+            {
+                /* both share the first two bins, the last one tells them apart */
+                x264_cabac_encode_decision( &h->cabac, 14, 0 );
+                x264_cabac_encode_decision( &h->cabac, 15, 1 );
+                x264_cabac_encode_decision( &h->cabac, 17, mb->i_partition == D_16x8 ? 1 : 0 );
+            }
+            /* any other partition: nothing is written */
+            return;
+        }
+        if( mb->i_type == P_8x8 )
+        {
+            x264_cabac_encode_decision( &h->cabac, 14, 0 );
+            x264_cabac_encode_decision( &h->cabac, 15, 0 );
+            x264_cabac_encode_decision( &h->cabac, 16, 1 );
+            return;
+        }
+
+        /* everything left is coded as intra: prefix */
+        x264_cabac_encode_decision( &h->cabac, 14, 1 );
+
+        /* suffix */
+        if( mb->i_type == I_4x4 )
+        {
+            x264_cabac_encode_decision( &h->cabac, 17, 0 );
+            return;
+        }
+
+        x264_cabac_encode_decision( &h->cabac, 17, 1 );
+        if( mb->i_type == I_PCM )
+        {
+            x264_cabac_encode_terminal( &h->cabac, 1 ); /*ctxIdx == 276 */
+            return;
+        }
+        x264_cabac_encode_terminal( &h->cabac, 0 ); /*ctxIdx == 276 */
+
+        /* intra 16x16 */
+        i_ctx_luma      = 17 + 1;
+        i_ctx_chroma    = 17 + 2;
+        i_ctx_chroma_ac = 17 + 2;
+        i_ctx_pred_hi   = 17 + 3;
+        i_ctx_pred_lo   = 17 + 3;
+    }
+    else
+    {
+        fprintf( stderr, "SLICE_TYPE_B unsupported in x264_macroblock_write_cabac\n" );
+        return;
+    }
+
+    /* Intra 16x16: cbp luma flag, cbp chroma (1 or 2 bins), 2 bins of pred mode */
+    x264_cabac_encode_decision( &h->cabac, i_ctx_luma, mb->i_cbp_luma != 0 );
+    if( mb->i_cbp_chroma != 0 )
+    {
+        x264_cabac_encode_decision( &h->cabac, i_ctx_chroma, 1 );
+        x264_cabac_encode_decision( &h->cabac, i_ctx_chroma_ac, mb->i_cbp_chroma != 1 );
+    }
+    else
+    {
+        x264_cabac_encode_decision( &h->cabac, i_ctx_chroma, 0 );
+    }
+    x264_cabac_encode_decision( &h->cabac, i_ctx_pred_hi, ( mb->i_intra16x16_pred_mode / 2 ) != 0 );
+    x264_cabac_encode_decision( &h->cabac, i_ctx_pred_lo, ( mb->i_intra16x16_pred_mode % 2 ) != 0 );
+}
+
+/* Encode one intra 4x4 prediction mode: a single flag (context 68) when
+ * i_mode equals the predicted mode i_pred, otherwise the flag followed by
+ * 3 bits (context 69, lowest bit first) of the mode, numbered without
+ * i_pred. */
+static void x264_cabac_mb_intra4x4_pred_mode( x264_t *h, x264_macroblock_t *mb, int i_pred, int i_mode )
+{
+    int i_bit;
+
+    if( i_pred == i_mode )
+    {
+        /* b_prev_intra4x4_pred_mode */
+        x264_cabac_encode_decision( &h->cabac, 68, 1 );
+        return;
+    }
+
+    /* b_prev_intra4x4_pred_mode */
+    x264_cabac_encode_decision( &h->cabac, 68, 0 );
+
+    /* modes above the predicted one move down by one */
+    if( i_mode > i_pred  )
+    {
+        i_mode -= 1;
+    }
+    for( i_bit = 0; i_bit < 3; i_bit++ )
+    {
+        x264_cabac_encode_decision( &h->cabac, 69, (i_mode >> i_bit)&0x01 );
+    }
+}
+/* Encode mb->i_chroma_pred_mode. The context of the first bin (64..66) is
+ * raised once for every neighbour (mba, mbb) that is I_4x4 or I_16x16 with a
+ * non zero chroma pred mode; the following bins all use context 64 + 3. */
+static void x264_cabac_mb_intra8x8_pred_mode( x264_t *h, x264_macroblock_t *mb )
+{
+    x264_macroblock_t *nb[2];
+    int i_ctx = 64;
+    int i;
+
+    nb[0] = mb->context->mba;
+    nb[1] = mb->context->mbb;
+
+    for( i = 0; i < 2; i++ )
+    {
+        if( nb[i] == NULL )
+        {
+            continue;
+        }
+        if( nb[i]->i_type != I_4x4 && nb[i]->i_type != I_16x16 )
+        {
+            continue;
+        }
+        if( nb[i]->i_chroma_pred_mode != 0 )
+        {
+            i_ctx++;
+        }
+    }
+
+    if( mb->i_chroma_pred_mode == 0 )
+    {
+        x264_cabac_encode_decision( &h->cabac, i_ctx, 0 );
+        return;
+    }
+
+    x264_cabac_encode_decision( &h->cabac, i_ctx, 1 );
+    x264_cabac_encode_decision( &h->cabac, 64 + 3, mb->i_chroma_pred_mode != 1 );
+    if( mb->i_chroma_pred_mode > 1 )
+    {
+        x264_cabac_encode_decision( &h->cabac, 64 + 3, mb->i_chroma_pred_mode != 2 );
+    }
+}
+
+/* Encode the 4 bits of mb->i_cbp_luma, one per 8x8 block. The context is
+ * 73 plus 1 when the 8x8 block on the mba side is not coded (or its
+ * macroblock is skipped) and plus 2 for the same test on the mbb side;
+ * a missing or I_PCM neighbour adds nothing. */
+static void x264_cabac_mb_cbp_luma( x264_t *h, x264_macroblock_t *mb )
+{
+    int i8x8;
+
+    for( i8x8 = 0; i8x8 < 4; i8x8++ )
+    {
+        /* first 4x4 block of this 8x8 block */
+        const int idx = 4 * i8x8;
+        const int x = block_idx_x[idx];
+        const int y = block_idx_y[idx];
+
+        /* 8x8 block holding the 4x4 block on each side (wrapping into the
+         * neighbour macroblock) */
+        const int i8x8a = block_idx_xy[(x-1)&0x03][y]/4;
+        const int i8x8b = block_idx_xy[x][(y-1)&0x03]/4;
+
+        x264_macroblock_t *mba = mb->context->block[idx].mba;
+        x264_macroblock_t *mbb = mb->context->block[idx].mbb;
+
+        int i_ctx = 73;
+
+        if( mba != NULL && mba->i_type != I_PCM )
+        {
+            if( IS_SKIP( mba->i_type ) || ((mba->i_cbp_luma >> i8x8a)&0x01) == 0 )
+            {
+                i_ctx += 1;
+            }
+        }
+        if( mbb != NULL && mbb->i_type != I_PCM )
+        {
+            if( IS_SKIP( mbb->i_type ) || ((mbb->i_cbp_luma >> i8x8b)&0x01) == 0 )
+            {
+                i_ctx += 2;
+            }
+        }
+
+        x264_cabac_encode_decision( &h->cabac, i_ctx, (mb->i_cbp_luma >> i8x8)&0x01 );
+    }
+}
+
+/* Encode mb->i_cbp_chroma with one or two bins.
+ *  bin 0 (context 77..80): cbp chroma > 0
+ *  bin 1 (context 81..84): cbp chroma > 1, only written when bin 0 is 1
+ * For each bin the context is raised by 1 (mba) and by 2 (mbb) when the
+ * neighbour is present, not skipped, and is I_PCM or has the same property
+ * (cbp chroma != 0 for bin 0, == 2 for bin 1). */
+static void x264_cabac_mb_cbp_chroma( x264_t *h, x264_macroblock_t *mb )
+{
+    x264_macroblock_t *mba = mb->context->mba;
+    x264_macroblock_t *mbb = mb->context->mbb;
+    int i_ctx;
+
+    /* bin 0 */
+    i_ctx = 77;
+    if( mba != NULL && !IS_SKIP( mba->i_type ) )
+    {
+        if( mba->i_type == I_PCM || mba->i_cbp_chroma != 0 )
+        {
+            i_ctx += 1;
+        }
+    }
+    if( mbb != NULL && !IS_SKIP( mbb->i_type ) )
+    {
+        if( mbb->i_type == I_PCM || mbb->i_cbp_chroma != 0 )
+        {
+            i_ctx += 2;
+        }
+    }
+
+    if( !( mb->i_cbp_chroma > 0 ) )
+    {
+        x264_cabac_encode_decision( &h->cabac, i_ctx, 0 );
+        return;
+    }
+    x264_cabac_encode_decision( &h->cabac, i_ctx, 1 );
+
+    /* bin 1 */
+    i_ctx = 77 + 4;
+    if( mba != NULL && !IS_SKIP( mba->i_type ) )
+    {
+        if( mba->i_type == I_PCM || mba->i_cbp_chroma == 2 )
+        {
+            i_ctx += 1;
+        }
+    }
+    if( mbb != NULL && !IS_SKIP( mbb->i_type ) )
+    {
+        if( mbb->i_type == I_PCM || mbb->i_cbp_chroma == 2 )
+        {
+            i_ctx += 2;
+        }
+    }
+    x264_cabac_encode_decision( &h->cabac, i_ctx, (mb->i_cbp_chroma > 1 ? 1 : 0) );
+}
+
+/* TODO check it with != qp per mb */
+/* Encode the qp difference between mb and the macroblock stored just before
+ * it (mb - 1), or the slice qp for the macroblock at (0,0). The difference d
+ * is mapped to -2*d when d <= 0 and to 2*d-1 otherwise, then written in
+ * unary: contexts 60 or 61 for the first bin, 62 for the second, 63 after. */
+static void x264_cabac_mb_qp_delta( x264_t *h, x264_macroblock_t *mb )
+{
+    const int i_slice_qp = h->pps->i_pic_init_qp + h->sh.i_qp_delta;
+    x264_macroblock_t *prev = NULL;
+    int i_prev_dqp = 0;     /* qp difference of the previous macroblock */
+    int i_dqp;
+    int i_ctx = 60;
+    int i_code;
+
+    if( mb->i_mb_x > 0 || mb->i_mb_y > 0 )
+    {
+        prev = mb - 1;
+        if( prev->i_mb_x > 0 || prev->i_mb_y > 0 )
+        {
+            i_prev_dqp = prev->i_qp - (prev-1)->i_qp;
+        }
+        else
+        {
+            i_prev_dqp = prev->i_qp - i_slice_qp;
+        }
+    }
+
+    /* the first bin uses context 61 only when the previous macroblock
+     * carried a non zero difference together with some coded data */
+    if( prev != NULL && i_prev_dqp != 0 &&
+        !IS_SKIP( prev->i_type ) && prev->i_type != I_PCM )
+    {
+        if( prev->i_type == I_16x16 || prev->i_cbp_luma != 0 || prev->i_cbp_chroma != 0 )
+        {
+            i_ctx = 60 + 1;
+        }
+    }
+
+    if( prev != NULL )
+    {
+        i_dqp = mb->i_qp - prev->i_qp;
+    }
+    else
+    {
+        i_dqp = mb->i_qp - i_slice_qp;
+    }
+
+    if( i_dqp <= 0 )
+    {
+        i_code = -2 * i_dqp;
+    }
+    else
+    {
+        i_code = 2 * i_dqp - 1;
+    }
+
+    for( ; i_code > 0; i_code-- )
+    {
+        x264_cabac_encode_decision( &h->cabac,  i_ctx, 1 );
+        i_ctx = ( i_ctx < 60 + 2 ) ? 60 + 2 : 60 + 3;
+    }
+    x264_cabac_encode_decision( &h->cabac,  i_ctx, 0 );
+}
+
+/* Compute the context increment used to code the "coded block flag" of one
+ * block of mb.
+ *   i_ctxBlockCat : block category
+ *                    0: luma DC of I_16x16     (i_idx unused)
+ *                    1, 2: luma 4x4 blocks     (i_idx = block 0..15)
+ *                    3: chroma DC              (i_idx = chroma plane 0/1)
+ *                    4: chroma AC              (i_idx = chroma block 0..7)
+ *   returns       : 4 * i_ctxBlockCat + (a ? 1 : 0) + (b ? 2 : 0)
+ * where a / b tell, for the neighbour on the mba / mbb side, that
+ *   - it is missing and mb is intra, or
+ *   - it is I_PCM, or
+ *   - its matching block is coded and has a non zero coefficient. */
+static int x264_cabac_mb_cbf_ctxidxinc( x264_macroblock_t *mb, int i_ctxBlockCat, int i_idx )
+{
+    x264_mb_context_t *ctx = mb->context;
+    x264_macroblock_t *a = NULL;
+    x264_macroblock_t *b = NULL;
+    /* non zero count of the neighbour block, -1 when it has no such block */
+    int i_nza = -1;
+    int i_nzb = -1;
+
+    int i_ctxIdxInc = 0;
+
+    if( i_ctxBlockCat == 0 )
+    {
+        a = ctx->mba;
+        b = ctx->mbb;
+
+        /* only I_16x16 neighbours have a luma DC block */
+        if( a !=NULL && a->i_type == I_16x16 )
+        {
+            i_nza = array_non_zero_count( a->luma16x16_dc, 16 );
+        }
+        if( b !=NULL && b->i_type == I_16x16 )
+        {
+            i_nzb = array_non_zero_count( b->luma16x16_dc, 16 );
+        }
+    }
+    else if( i_ctxBlockCat == 1 || i_ctxBlockCat == 2 )
+    {
+        int i8x8a, i8x8b;
+        int x, y;
+
+        a = ctx->block[i_idx].mba;
+        b = ctx->block[i_idx].mbb;
+
+        x = block_idx_x[i_idx];
+        y = block_idx_y[i_idx];
+
+        /* 8x8 block holding the neighbour 4x4 block on each side */
+        i8x8a = block_idx_xy[(x-1)&0x03][y]/4;
+        i8x8b = block_idx_xy[x][(y-1)&0x03]/4;
+
+        /* Only the cbp bit of that very 8x8 block decides whether the
+         * neighbour block is coded: without the mask the bits of the higher
+         * 8x8 blocks would be tested as well. */
+        if( a != NULL && !IS_SKIP( a->i_type ) && a->i_type != I_PCM &&
+            ((a->i_cbp_luma >> i8x8a)&0x01) != 0 )
+        {
+            i_nza = ctx->block[i_idx].bka->i_non_zero_count;
+        }
+        if( b != NULL && !IS_SKIP( b->i_type ) && b->i_type != I_PCM &&
+            ((b->i_cbp_luma >> i8x8b)&0x01) != 0 )
+        {
+            i_nzb = ctx->block[i_idx].bkb->i_non_zero_count;
+        }
+    }
+    else if( i_ctxBlockCat == 3 )
+    {
+        a = ctx->mba;
+        b = ctx->mbb;
+
+        /* chroma DC is present as soon as cbp chroma is not 0 */
+        if( a != NULL && !IS_SKIP( a->i_type ) && a->i_type != I_PCM &&
+            a->i_cbp_chroma != 0 )
+        {
+            i_nza = array_non_zero_count( a->chroma_dc[i_idx], 4 );
+        }
+        if( b != NULL && !IS_SKIP( b->i_type ) && b->i_type != I_PCM &&
+            b->i_cbp_chroma != 0 )
+        {
+            i_nzb = array_non_zero_count( b->chroma_dc[i_idx], 4 );
+        }
+    }
+    else if( i_ctxBlockCat == 4 )
+    {
+        a = ctx->block[16+i_idx].mba;
+        b = ctx->block[16+i_idx].mbb;
+
+        /* chroma AC is only present when cbp chroma is 2 */
+        if( a != NULL && !IS_SKIP( a->i_type ) && a->i_type != I_PCM &&
+            a->i_cbp_chroma == 2 )
+        {
+            i_nza = ctx->block[16+i_idx].bka->i_non_zero_count;
+        }
+        if( b != NULL && !IS_SKIP( b->i_type ) && b->i_type != I_PCM &&
+            b->i_cbp_chroma == 2 )
+        {
+            i_nzb = ctx->block[16+i_idx].bkb->i_non_zero_count;
+        }
+    }
+
+    if( ( a == NULL && IS_INTRA( mb->i_type ) ) || ( a != NULL && a->i_type == I_PCM ) || i_nza > 0 )
+    {
+        i_ctxIdxInc++;
+    }
+    if( ( b == NULL && IS_INTRA( mb->i_type ) ) || ( b != NULL && b->i_type == I_PCM ) || i_nzb > 0 )
+    {
+        i_ctxIdxInc += 2;
+    }
+
+    return i_ctxIdxInc + 4 * i_ctxBlockCat;
+}
+
+/* Encode the skip flag of mb (1 when b_skip is non zero). The context starts
+ * at 11 for P slices and at 24 for any other slice type, and is raised once
+ * for every neighbour (mba, mbb) that is present and not skipped. */
+void x264_cabac_mb_skip( x264_t *h, x264_macroblock_t *mb, int b_skip )
+{
+    x264_macroblock_t *nb[2];
+    int i_ctx;
+    int i;
+
+    nb[0] = mb->context->mba;
+    nb[1] = mb->context->mbb;
+
+    /* 11: SLICE_TYPE_P, 24: SLICE_TYPE_B */
+    i_ctx = ( h->sh.i_type == SLICE_TYPE_P ) ? 11 : 24;
+
+    for( i = 0; i < 2; i++ )
+    {
+        if( nb[i] != NULL && !IS_SKIP( nb[i]->i_type ) )
+        {
+            i_ctx++;
+        }
+    }
+
+    x264_cabac_encode_decision( &h->cabac, i_ctx, b_skip ? 1 : 0 );
+}
+
+/* Encode in unary the reference index (list i_list) of partition i_part.
+ * The context of the first bin is 54, plus 1 when the left neighbour
+ * partition has a reference > 0 and plus 2 for the upper one (intra,
+ * missing or skipped neighbours add nothing); the second bin uses 54 + 4
+ * and all the following ones 54 + 5. */
+static void x264_cabac_mb_ref( x264_t *h, x264_macroblock_t *mb, int i_list, int i_part )
+{
+    x264_macroblock_t *left;
+    x264_macroblock_t *up;
+
+    int i_ref_left = -1;
+    int i_ref_up   = -1;
+    int i_ctx = 54;
+    int i_left;
+
+    int x, y, xn, yn;
+
+    x264_mb_partition_getxy( mb, i_part, 0, &x, &y );
+    i_left = mb->partition[x][y].i_ref[i_list];
+
+    /* Left  pixel (-1,0): inside mb, or last column of the left macroblock */
+    if( x - 1 < 0 )
+    {
+        xn   = x - 1 + 4;
+        left = mb->context->mba;
+    }
+    else
+    {
+        xn   = x - 1;
+        left = mb;
+    }
+    if( left && !IS_INTRA( left->i_type ) )
+    {
+        i_ref_left = left->partition[xn][y].i_ref[i_list];
+    }
+
+    /* Up ( pixel(0,-1): inside mb, or last row of the upper macroblock */
+    if( y - 1 < 0 )
+    {
+        yn = y - 1 + 4;
+        up = mb->context->mbb;
+    }
+    else
+    {
+        yn = y - 1;
+        up = mb;
+    }
+    if( up && !IS_INTRA( up->i_type ) )
+    {
+        i_ref_up = up->partition[x][yn].i_ref[i_list];
+    }
+
+    /* FIXME not complete for B frame (B_DIRECT and B_DIRECT 8x8 sub */
+    /* a reference > 0 implies that the neighbour pointer is valid */
+    if( i_ref_left > 0 && !IS_SKIP( left->i_type ) )
+    {
+        i_ctx += 1;
+    }
+    if( i_ref_up > 0 && !IS_SKIP( up->i_type ) )
+    {
+        i_ctx += 2;
+    }
+
+    for( ; i_left > 0; i_left-- )
+    {
+        x264_cabac_encode_decision( &h->cabac, i_ctx, 1 );
+        i_ctx = ( i_ctx < 54 + 4 ) ? 54 + 4 : 54 + 5;
+    }
+    x264_cabac_encode_decision( &h->cabac, i_ctx, 0 );
+}
+
+static void  x264_cabac_mb_mvd( x264_t *h, int i_ctx, int i_ctx_inc, int mvd )
+{
+    /* Encode one motion vector difference component: a context coded unary
+     * prefix of |mvd| capped at 9, an order 3 Exp-Golomb suffix in bypass
+     * bins when |mvd| >= 9, and a sign bypass bin for non-zero values. */
+    const int i_magnitude = abs( mvd );
+    const int b_escape = i_magnitude >= 9;
+    const int i_ones = b_escape ? 9 : i_magnitude;
+    int n;
+
+    /* Prefix: the increment jumps to 3 after the first bin, then climbs
+     * by one per bin until it saturates at 6 */
+    for( n = 0; n < i_ones; n++ )
+    {
+        x264_cabac_encode_decision( &h->cabac, i_ctx + i_ctx_inc, 1 );
+        if( i_ctx_inc < 3 )
+            i_ctx_inc = 3;
+        else if( i_ctx_inc < 6 )
+            i_ctx_inc++;
+    }
+
+    if( !b_escape )
+    {
+        /* Short value: a '0' bin closes the unary string */
+        x264_cabac_encode_decision( &h->cabac, i_ctx + i_ctx_inc, 0 );
+    }
+    else
+    {
+        int i_order = 3;
+        int i_rest = i_magnitude - 9;
+
+        /* Exp-Golomb: one '1' per order step, a '0', then i_order bits
+         * of what is left, most significant bit first */
+        for( ; i_rest >= (1 << i_order); i_order++ )
+        {
+            x264_cabac_encode_bypass( &h->cabac, 1 );
+            i_rest -= 1 << i_order;
+        }
+        x264_cabac_encode_bypass( &h->cabac, 0 );
+        while( i_order-- )
+        {
+            x264_cabac_encode_bypass( &h->cabac, (i_rest >> i_order) & 0x01 );
+        }
+    }
+
+    /* Sign: 0 for positive, 1 for negative, nothing for zero */
+    if( mvd != 0 )
+    {
+        x264_cabac_encode_bypass( &h->cabac, mvd < 0 ? 1 : 0 );
+    }
+}
+
+static void  x264_cabac_mb_mv( x264_t *h, x264_macroblock_t *mb, int i_list, int i_part, int i_sub )
+{
+    /* Encode the motion vector difference of sub-block i_sub of partition
+     * i_part for list i_list.
+     *
+     * For each component the context increment of the first bin is chosen
+     * from the sum of the absolute differences already stored for the left
+     * and upper neighbouring blocks: 0 below 3, 2 above 32, 1 otherwise.
+     * The x component uses context base 40, the y component 47.
+     */
+    static const int i_ctx_base[2] = { 40, 47 };
+
+    x264_macroblock_t *mb_left = mb;
+    x264_macroblock_t *mb_up   = mb;
+
+    int mvd[2];
+    int i_sum[2] = { 0, 0 };
+    int x, y;
+    int x_left, y_up;
+    int c;
+
+    /* get and update mvd */
+    x264_mb_partition_mvd( mb, i_list, i_part, i_sub, mvd );
+
+    /* coordinates of the block, used to index partition[][] */
+    x264_mb_partition_getxy( mb, i_part, i_sub, &x, &y );
+
+    /* FIXME not complete for B frame (B_DIRECT and B_DIRECT 8x8 sub */
+
+    /* Left block (-1,0): column 3 of the left macroblock at the border */
+    x_left = x - 1;
+    if( x_left < 0 )
+    {
+        x_left += 4;
+        mb_left = mb->context->mba;
+    }
+
+    /* Upper block (0,-1): row 3 of the upper macroblock at the border */
+    y_up = y - 1;
+    if( y_up < 0 )
+    {
+        y_up += 4;
+        mb_up = mb->context->mbb;
+    }
+
+    /* Missing, intra and skipped neighbours count as zero */
+    if( mb_left != NULL && !IS_INTRA( mb_left->i_type ) && !IS_SKIP( mb_left->i_type ) )
+    {
+        for( c = 0; c < 2; c++ )
+        {
+            i_sum[c] += abs( mb_left->partition[x_left][y].mvd[i_list][c] );
+        }
+    }
+    if( mb_up != NULL && !IS_INTRA( mb_up->i_type ) && !IS_SKIP( mb_up->i_type ) )
+    {
+        for( c = 0; c < 2; c++ )
+        {
+            i_sum[c] += abs( mb_up->partition[x][y_up].mvd[i_list][c] );
+        }
+    }
+
+    /* x component first, then y */
+    for( c = 0; c < 2; c++ )
+    {
+        int i_inc = 1;
+
+        if( i_sum[c] < 3 )
+        {
+            i_inc = 0;
+        }
+        else if( i_sum[c] > 32 )
+        {
+            i_inc = 2;
+        }
+        x264_cabac_mb_mvd( h, i_ctx_base[c], i_inc, mvd[c] );
+    }
+}
+static void x264_cabac_mb_sub_partition( x264_t *h, int i_sub )
+{
+    /* Encode a list 0 sub-partition type with contexts 21, 22 and 23.
+     * Bin strings: 8x8 = 1, 8x4 = 00, 4x8 = 011, 4x4 = 010.
+     * Context 21 carries the first bin, 22 the second, 23 the third.
+     * Any other value of i_sub writes nothing. */
+    if( i_sub == D_L0_8x8 )
+    {
+        x264_cabac_encode_decision( &h->cabac, 21, 1 );
+        return;
+    }
+    if( i_sub != D_L0_8x4 && i_sub != D_L0_4x8 && i_sub != D_L0_4x4 )
+    {
+        return;
+    }
+
+    x264_cabac_encode_decision( &h->cabac, 21, 0 );
+    x264_cabac_encode_decision( &h->cabac, 22, i_sub == D_L0_8x4 ? 0 : 1 );
+    if( i_sub != D_L0_8x4 )
+    {
+        x264_cabac_encode_decision( &h->cabac, 23, i_sub == D_L0_4x8 ? 1 : 0 );
+    }
+}
+
+static void block_residual_write_cabac( x264_t *h, x264_macroblock_t *mb, int i_ctxBlockCat, int i_idx, int *l, int i_count )
+{
+    static const int significant_coeff_flag_offset[5] = { 0, 15, 29, 44, 47 };
+    static const int last_significant_coeff_flag_offset[5] = { 0, 15, 29, 44, 47 };
+    static const int coeff_abs_level_m1_offset[5] = { 0, 10, 20, 30, 39 };
+
+    int i_coeff_abs_m1[16];     /* |level| - 1 of each non-zero coefficient, in l[] order */
+    int i_coeff_sign[16];       /* 1 when the matching coefficient is negative */
+    int i_coeff = 0;            /* number of non-zero coefficients */
+    int i_last  = 0;            /* index in l[] of the last non-zero coefficient */
+
+    int i_abslevel1 = 0;        /* levels of magnitude 1 written so far */
+    int i_abslevelgt1 = 0;      /* levels of magnitude above 1 written so far */
+
+    int i;
+    /* Write the i_count coefficients of l[] as one CABAC residual block.
+     * i_ctxBlockCat: 0-> DC 16x16  i_idx = 0
+     *                1-> AC 16x16  i_idx = luma4x4idx
+     *                2-> Luma4x4   i_idx = luma4x4idx
+     *                3-> DC Chroma i_idx = iCbCr
+     *                4-> AC Chroma i_idx = 4 * iCbCr + chroma4x4idx
+     */
+
+    /* Gather magnitude - 1 and sign of every non-zero coefficient */
+    for( i = 0; i < i_count; i++ )
+    {
+        /* i_last ends up as the index of the last non-zero entry */
+        if( l[i] != 0 )
+        {
+            i_coeff_abs_m1[i_coeff] = abs( l[i] ) - 1;
+            i_coeff_sign[i_coeff]   = ( l[i] < 0 ? 1 : 0);
+            i_coeff++;
+
+            i_last = i;
+        }
+    }
+
+    /* An all-zero block only sends its coded block flag as 0 */
+    if( i_coeff == 0 )
+    {
+        /* coded block flag */
+        x264_cabac_encode_decision( &h->cabac,  85 + x264_cabac_mb_cbf_ctxidxinc( mb, i_ctxBlockCat, i_idx ), 0 );
+        return;
+    }
+
+    /* block coded: flag it, then write the significance map */
+    x264_cabac_encode_decision( &h->cabac,  85 + x264_cabac_mb_cbf_ctxidxinc( mb, i_ctxBlockCat, i_idx ), 1 );
+    for( i = 0; i < i_count - 1; i++ )
+    {
+        int i_ctxIdxInc;
+
+        i_ctxIdxInc = X264_MIN( i, i_count - 2 );
+
+        if( l[i] != 0 )
+        {
+            x264_cabac_encode_decision( &h->cabac, 105 + significant_coeff_flag_offset[i_ctxBlockCat] + i_ctxIdxInc, 1 );
+            x264_cabac_encode_decision( &h->cabac, 166 + last_significant_coeff_flag_offset[i_ctxBlockCat] + i_ctxIdxInc, i == i_last ? 1 : 0 );
+        }
+        else
+        {
+            x264_cabac_encode_decision( &h->cabac, 105 + significant_coeff_flag_offset[i_ctxBlockCat] + i_ctxIdxInc, 0 );
+        }
+        if( i == i_last )   /* nothing is signalled past the last non-zero coefficient */
+        {
+            break;
+        }
+    }
+
+    for( i = i_coeff - 1; i >= 0; i-- )
+    {
+        int i_prefix;
+        int i_ctxIdxInc;
+
+        /* write coeff_abs - 1, starting with the last non-zero coefficient */
+
+        /* prefix: unary string capped at 14, context coded */
+        i_prefix = X264_MIN( i_coeff_abs_m1[i], 14 );
+
+        i_ctxIdxInc = (i_abslevelgt1 != 0 ? 0 : X264_MIN( 4, i_abslevel1 + 1 )) + coeff_abs_level_m1_offset[i_ctxBlockCat];
+        if( i_prefix == 0 )
+        {
+            x264_cabac_encode_decision( &h->cabac,  227 + i_ctxIdxInc, 0 );
+        }
+        else
+        {
+            int j;
+            x264_cabac_encode_decision( &h->cabac,  227 + i_ctxIdxInc, 1 );
+            i_ctxIdxInc = 5 + X264_MIN( 4, i_abslevelgt1 ) + coeff_abs_level_m1_offset[i_ctxBlockCat];
+            for( j = 0; j < i_prefix - 1; j++ )
+            {
+                x264_cabac_encode_decision( &h->cabac,  227 + i_ctxIdxInc, 1 );
+            }
+            if( i_prefix < 14 )
+            {
+                x264_cabac_encode_decision( &h->cabac,  227 + i_ctxIdxInc, 0 );
+            }
+        }
+        /* suffix: order 0 Exp-Golomb escape in bypass bins, only for values >= 14 */
+        if( i_coeff_abs_m1[i] >= 14 )
+        {
+            int k = 0;
+            int i_suffix = i_coeff_abs_m1[i] - 14;
+
+            while( i_suffix >= (1<<k) )
+            {
+                x264_cabac_encode_bypass( &h->cabac, 1 );
+                i_suffix -= 1 << k;
+                k++;
+            }
+            x264_cabac_encode_bypass( &h->cabac, 0 );
+            while( k-- )
+            {
+                x264_cabac_encode_bypass( &h->cabac, (i_suffix >> k)&0x01 );
+            }
+        }
+
+        /* write sign */
+        x264_cabac_encode_bypass( &h->cabac, i_coeff_sign[i] );
+
+        /* update the counters that select the contexts of the next level */
+        if( i_coeff_abs_m1[i] == 0 )
+        {
+            i_abslevel1++;
+        }
+        else
+        {
+            i_abslevelgt1++;
+        }
+    }
+}
+
+
+
+void x264_macroblock_write_cabac( x264_t *h, bs_t *s, x264_macroblock_t *mb )
+{
+    int i;
+
+    /* Write one macroblock with CABAC: the macroblock type, then either raw
+     * I_PCM samples or the prediction information (intra modes, or
+     * sub-partition types, reference indices and motion vector differences),
+     * the coded block patterns, the quantiser delta and the residual blocks.
+     * h supplies the CABAC state, the slice header and h->picture; s is the
+     * bitstream that receives the raw I_PCM bytes.
+     */
+    /* Write the MB type */
+    x264_cabac_mb_type( h, mb );
+
+    /* PCM special block type UNTESTED */
+    if( mb->i_type == I_PCM )
+    {
+        bs_align_0( s );    /* not sure */
+        /* Luma: 16 rows of 16 samples, consecutive rows are i_stride[0] apart */
+        for( i = 0; i < 16*16; i++ )
+        {
+            bs_write( s, 8, h->picture->plane[0][( mb->i_mb_y * 16 + i / 16 ) * h->picture->i_stride[0] + mb->i_mb_x * 16 + i % 16] );
+        }
+        /* Cb: 8 rows of 8 samples, consecutive rows are i_stride[1] apart */
+        for( i = 0; i < 8*8; i++ )
+        {
+            bs_write( s, 8, h->picture->plane[1][( mb->i_mb_y * 8 + i / 8 ) * h->picture->i_stride[1] + mb->i_mb_x * 8 + i % 8] );
+        }
+        /* Cr: 8 rows of 8 samples, consecutive rows are i_stride[2] apart */
+        for( i = 0; i < 8*8; i++ )
+        {
+            bs_write( s, 8, h->picture->plane[2][( mb->i_mb_y * 8 + i / 8 ) * h->picture->i_stride[2] + mb->i_mb_x * 8 + i % 8] );
+        }
+
+        for( i = 0; i < 16 + 8; i++ )
+        {
+            /* special case */
+            mb->block[i].i_non_zero_count = 16;
+        }
+
+        x264_cabac_encode_init( &h->cabac, s );
+        return;
+    }
+
+    if( IS_INTRA( mb->i_type ) )
+    {
+        /* Prediction */
+        if( mb->i_type == I_4x4 )
+        {
+            for( i = 0; i < 16; i++ )
+            {
+                x264_cabac_mb_intra4x4_pred_mode( h, mb,
+                                                  x264_mb_predict_intra4x4_mode( h, mb, i ),
+                                                  mb->block[i].i_intra4x4_pred_mode );
+            }
+        }
+        x264_cabac_mb_intra8x8_pred_mode( h, mb );
+    }
+    else if( mb->i_type == P_8x8 )
+    {
+        /* sub mb type */
+        for( i = 0; i < 4; i++ )
+        {
+            x264_cabac_mb_sub_partition( h, mb->i_sub_partition[i] );
+        }
+        /* ref 0 */
+        if( h->sh.i_num_ref_idx_l0_active > 1 )
+        {
+            for( i = 0; i < 4; i++ )
+            {
+                x264_cabac_mb_ref( h, mb, 0, i );
+            }
+        }
+
+        for( i = 0; i < 4; i++ )
+        {
+            int i_sub;
+            for( i_sub = 0; i_sub < x264_mb_partition_count_table[mb->i_sub_partition[i]]; i_sub++ )
+            {
+                x264_cabac_mb_mv( h, mb, 0, i, i_sub );
+            }
+        }
+    }
+    else if( mb->i_type == B_8x8 )
+    {
+        /* TODO */
+        fprintf( stderr, "Arggg B_8x8\n" );
+    }
+    else if( mb->i_type != B_DIRECT )
+    {
+        /* FIXME -> invalid for B frame */
+
+        /* Motion Vector */
+        int i_part = x264_mb_partition_count_table[mb->i_partition];
+
+        if( h->sh.i_num_ref_idx_l0_active > 1 )
+        {
+            for( i = 0; i < i_part; i++ )
+            {
+                if( mb->i_type == P_L0 )
+                {
+                    x264_cabac_mb_ref( h, mb, 0, i );
+                }
+            }
+        }
+
+        for( i = 0; i < i_part; i++ )
+        {
+            if( mb->i_type == P_L0 )
+            {
+                x264_cabac_mb_mv( h, mb, 0, i, 0 );
+            }
+        }
+    }
+
+    if( mb->i_type != I_16x16 )
+    {
+        x264_cabac_mb_cbp_luma( h, mb );
+        x264_cabac_mb_cbp_chroma( h, mb );
+    }
+
+    if( mb->i_cbp_luma > 0 || mb->i_cbp_chroma > 0 || mb->i_type == I_16x16 )
+    {
+        x264_cabac_mb_qp_delta( h, mb );
+
+        /* write residual */
+        if( mb->i_type == I_16x16 )
+        {
+            /* DC Luma */
+            block_residual_write_cabac( h, mb, 0, 0, mb->luma16x16_dc, 16 );
+
+            if( mb->i_cbp_luma != 0 )
+            {
+                /* AC Luma */
+                for( i = 0; i < 16; i++ )
+                {
+                    block_residual_write_cabac( h, mb, 1, i, mb->block[i].residual_ac, 15 );
+                }
+            }
+        }
+        else
+        {
+            for( i = 0; i < 16; i++ )
+            {
+                if( mb->i_cbp_luma & ( 1 << ( i / 4 ) ) )
+                {
+                    block_residual_write_cabac( h, mb, 2, i, mb->block[i].luma4x4, 16 );
+                }
+            }
+        }
+
+        if( mb->i_cbp_chroma &0x03 )    /* Chroma DC residual present */
+        {
+            block_residual_write_cabac( h, mb, 3, 0, mb->chroma_dc[0], 4 );
+            block_residual_write_cabac( h, mb, 3, 1, mb->chroma_dc[1], 4 );
+        }
+        if( mb->i_cbp_chroma&0x02 ) /* Chroma AC residual present */
+        {
+            for( i = 0; i < 8; i++ )
+            {
+                block_residual_write_cabac( h, mb, 4, i, mb->block[16+i].residual_ac, 15 );
+            }
+        }
+    }
+}
+
diff --git a/tools/.cvsignore b/tools/.cvsignore
new file mode 100644
index 00000000..bafdf1a1
--- /dev/null
+++ b/tools/.cvsignore
@@ -0,0 +1,2 @@
+xyuv
+avc2avi
diff --git a/tools/Jamfile b/tools/Jamfile
new file mode 100644
index 00000000..8507f195
--- /dev/null
+++ b/tools/Jamfile
@@ -0,0 +1,9 @@
+# Jamfile
+
+# Little tool to embed h264 into avi
+Main avc2avi : avc2avi.c ;
+
+# Little YUV I420 player
+LINKLIBS = `sdl-config --libs` ;
+Main xyuv : xyuv.c ;
+
diff --git a/tools/avc2avi.c b/tools/avc2avi.c
new file mode 100644
index 00000000..07f142bb
--- /dev/null
+++ b/tools/avc2avi.c
@@ -0,0 +1,820 @@
+/*****************************************************************************
+ * avc2avi.c: raw h264 -> AVI
+ *****************************************************************************
+ * Copyright (C) 2004 Laurent Aimar
+ * $Id: avc2avi.c,v 1.1 2004/06/03 19:27:08 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdint.h>
+
+#include <signal.h>
+#define _GNU_SOURCE
+#include <getopt.h>
+
+#ifdef _MSC_VER
+#include <io.h>     /* _setmode() */
+#include <fcntl.h>  /* _O_BINARY */
+#endif
+
+#include "../core/bs.h"
+
+#define DATA_MAX 3000000
+uint8_t data[DATA_MAX];
+
+/* Ctrl-C handler: only raises a flag; sig_atomic_t keeps the store signal safe */
+static volatile sig_atomic_t i_ctrl_c = 0;
+static void    SigIntHandler( int a )
+{
+    i_ctrl_c = 1;
+}
+
+typedef struct
+{
+    char *psz_fin;      /* input path (-i); NULL, "" or "-" makes main() read stdin */
+    char *psz_fout;     /* output path (-o); NULL, "" or "-" means no file was named */
+
+    float f_fps;        /* frame rate (-f), 25.0 unless overridden */
+    char  fcc[4];       /* codec fourcc (-c), space padded, not NUL terminated */
+} cfg_t;
+
+typedef struct
+{
+    int i_data;         /* bytes currently stored in p_data */
+    int i_data_max;     /* bytes allocated for p_data */
+    uint8_t *p_data;    /* heap buffer, enlarged by vbuf_add() when needed */
+} vbuf_t;
+
+void vbuf_init( vbuf_t * );
+void vbuf_add( vbuf_t *, int i_data, void *p_data );
+void vbuf_reset( vbuf_t * );
+
+typedef struct
+{
+    FILE *f;                /* stream every avi_write_* helper writes to */
+
+    float f_fps;            /* frame rate written into the headers */
+    char  fcc[4];           /* codec fourcc written into strh and strf */
+
+    int   i_width;          /* 0 at init; main() copies the parsed size in before avi_end() */
+    int   i_height;
+
+    int64_t i_movi;         /* file offset right after the header written by avi_init() */
+    int64_t i_movi_end;     /* while 0 the header carries 0xFFFFFFFF as 'movi' size */
+    int64_t i_riff;         /* while 0 the header carries 0xFFFFFFFF as RIFF size */
+
+    int      i_frame;       /* frame count written into the headers and the index */
+    int      i_idx_max;     /* presumably the allocated capacity of idx -- TODO confirm */
+    uint32_t *idx;          /* index data, 16 bytes per frame (see avi_write_idx) */
+} avi_t;
+
+void avi_init( avi_t *, FILE *, float, char fcc[4] );
+void avi_write( avi_t *, vbuf_t *, int  );
+void avi_end( avi_t * );
+
+enum nal_unit_type_e
+{
+    NAL_UNKNOWN = 0,
+    NAL_SLICE   = 1,        /* 1..5 are the types whose slice header is parsed */
+    NAL_SLICE_DPA   = 2,
+    NAL_SLICE_DPB   = 3,
+    NAL_SLICE_DPC   = 4,
+    NAL_SLICE_IDR   = 5,    /* ref_idc != 0 */
+    NAL_SEI         = 6,    /* ref_idc == 0 */
+    NAL_SPS         = 7,    /* parsed for the picture size and the frame_num width */
+    NAL_PPS         = 8     /* like the SPS, always reported as the start of a new picture */
+    /* ref_idc == 0 for 6,9,10,11,12 */
+};
+enum nal_priority_e    /* values of nal_t.i_ref_idc */
+{
+    NAL_PRIORITY_DISPOSABLE = 0,
+    NAL_PRIORITY_LOW        = 1,
+    NAL_PRIORITY_HIGH       = 2,
+    NAL_PRIORITY_HIGHEST    = 3,
+};
+
+typedef struct
+{
+    int i_ref_idc;  /* nal_priority_e */
+    int i_type;     /* nal_unit_type_e */
+
+    /* Raw payload of the unit: size in bytes, then the buffer (main() allocates DATA_MAX bytes) */
+    int     i_payload;
+    uint8_t *p_payload;
+} nal_t;
+
+typedef struct
+{
+    int i_width;                /* picture size from the last SPS, cropping subtracted; 0 before */
+    int i_height;
+
+    int i_nal_type;             /* type of the previously parsed NAL, -1 before any */
+    int i_ref_idc;              /* ref_idc of the previously parsed NAL, -1 before any */
+    int i_idr_pic_id;           /* idr_pic_id of the last IDR slice, -1 before any */
+    int i_frame_num;            /* frame_num of the last slice, -1 before any */
+
+    int b_key;                  /* set from the slice type of the last slice header */
+    int i_log2_max_frame_num;   /* bits used to read frame_num, taken from the SPS */
+} h264_t;
+
+void h264_parser_init( h264_t * );
+void h264_parser_parse( h264_t *h, nal_t *n, int *pb_nal_start );
+
+
+static int nal_decode( nal_t *nal, void *p_data, int i_data );
+
+static void Help( void );
+static int  Parse( int argc, char **argv, cfg_t * );
+static int  ParseNAL( nal_t *nal, avi_t *a, h264_t *h, int *pb_slice );
+
+/****************************************************************************
+ * main: split a raw H.264 stream into frames and write them to an AVI file
+ ****************************************************************************/
+int main( int argc, char **argv )
+{
+    cfg_t cfg;
+
+    FILE    *fout;
+    FILE    *fin;
+
+    vbuf_t  vb;
+    avi_t   avi;
+    h264_t  h264;
+
+    nal_t nal;
+    int i_data;
+    int b_eof;
+    int b_key;
+    int b_slice;
+
+#ifdef _MSC_VER
+    _setmode(_fileno(stdin), _O_BINARY);    /* thanks to Marcos Morais <morais at dee.ufcg.edu.br> */
+    _setmode(_fileno(stdout), _O_BINARY);
+#endif
+
+    /* Parse command line */
+    if( Parse( argc, argv, &cfg ) < 0 )
+    {
+        return -1;
+    }
+
+    /* Open input */
+    if( cfg.psz_fin == NULL || *cfg.psz_fin == '\0' || !strcmp( cfg.psz_fin, "-" ) )
+        fin = stdin;
+    else
+        fin = fopen( cfg.psz_fin, "rb" );
+    if( fin == NULL )
+    {
+        fprintf( stderr, "cannot open input file\n" );
+        return -1;
+    }
+
+    /* Open output (stdout when no file is named, as the help text states) */
+    if( cfg.psz_fout == NULL || *cfg.psz_fout == '\0' || !strcmp( cfg.psz_fout, "-" ) )
+        fout = stdout;
+    else
+        fout = fopen( cfg.psz_fout, "wb" );
+    if( fout == NULL )
+    {
+        fprintf( stderr, "cannot open output file\n" );
+        return -1;
+    }
+
+    /* Init avi */
+    avi_init( &avi, fout, cfg.f_fps, cfg.fcc );
+
+    /* Init parser */
+    h264_parser_init( &h264 );
+
+    /* Control-C handler */
+    signal( SIGINT, SigIntHandler );
+
+    /* Init data */
+    b_eof = 0;
+    b_key = 0;
+    b_slice = 0;
+    i_data  = 0;
+
+    /* Alloc space for a nal, used for decoding pps/sps/slice header */
+    nal.p_payload = malloc( DATA_MAX );
+    if( nal.p_payload == NULL )
+    {
+        fprintf( stderr, "cannot allocate memory\n" );
+        return -1;
+    }
+
+    vbuf_init( &vb );
+
+    /* split frame */
+    while( !i_ctrl_c )
+    {
+        uint8_t *p, *p_next, *end;
+        int i_size;
+
+        /* fill buffer */
+        if( i_data < DATA_MAX && !b_eof )
+        {
+            int i_read = fread( &data[i_data], 1, DATA_MAX - i_data, fin );
+            if( i_read <= 0 )
+                b_eof = 1;
+            else
+                i_data += i_read;
+        }
+        if( i_data < 3 )
+            break;
+
+        end = &data[i_data];
+
+        /* Search begin of a NAL */
+        p = &data[0];
+        while( p < end - 3 )
+        {
+            if( p[0] == 0x00 && p[1] == 0x00 && p[2] == 0x01 )
+                break;
+            p++;
+        }
+
+        if( p >= end - 3 )
+        {
+            fprintf( stderr, "garbage (i_data = %d)\n", i_data );
+            i_data = 0;
+            continue;
+        }
+
+        /* Search end of NAL */
+        p_next = p + 3;
+        while( p_next < end - 3 )
+        {
+            if( p_next[0] == 0x00 && p_next[1] == 0x00 && p_next[2] == 0x01 )
+                break;
+            p_next++;
+        }
+
+        /* No further start code and the buffer is not full: take all that is left */
+        if( p_next == end - 3 && i_data < DATA_MAX )
+            p_next = end;
+
+        /* Compute NAL size */
+        i_size = p_next - p - 3;
+        if( i_size <= 0 )
+        {
+            if( b_eof )
+                break;
+
+            fprintf( stderr, "nal too large (FIXME) ?\n" );
+            i_data = 0;
+            continue;
+        }
+
+        /* Nal start at p+3 with i_size length */
+        nal_decode( &nal, p +3, i_size < 2048 ? i_size : 2048 );
+
+        /* Key flag of the frame collected so far, sampled before this NAL is parsed */
+        b_key = h264.b_key;
+
+        /* SPS/PPS coming after a slice: the collected frame is complete */
+        if( b_slice && vb.i_data && ( nal.i_type == NAL_SPS || nal.i_type == NAL_PPS ) )
+        {
+            avi_write( &avi, &vb, b_key );
+            vbuf_reset( &vb );
+            b_slice = 0;
+        }
+
+        /* Parse SPS/PPS/Slice */
+        if( ParseNAL( &nal, &avi, &h264, &b_slice ) && vb.i_data > 0 )
+        {
+            avi_write( &avi, &vb, b_key );
+            vbuf_reset( &vb );
+        }
+
+        /* Append NAL to buffer */
+        vbuf_add( &vb, i_size + 3, p );
+
+        /* Remove this nal */
+        memmove( &data[0], p_next, end - p_next );
+        i_data -= p_next - &data[0];
+    }
+
+    if( vb.i_data > 0 )
+    {
+        avi_write( &avi, &vb, h264.b_key );
+    }
+
+    avi.i_width  = h264.i_width;
+    avi.i_height = h264.i_height;
+
+    avi_end( &avi );
+
+    /* free mem */
+    free( nal.p_payload );
+
+    fclose( fin );
+    fclose( fout );
+
+    return 0;
+}
+
+/*****************************************************************************
+ * Help:
+ *****************************************************************************/
+static void Help( void )
+{
+    fputs(   /* plain text without conversions, so no format string is needed */
+             "avc2avi\n"
+             "Syntax: avc2avi [options] [ -i input.h264 ] [ -o output.avi ]\n"
+             "\n"
+             "  -h, --help                  Print this help\n"
+             "\n"
+             "  -i, --input                 Specify input file (default: stdin)\n"
+             "  -o, --output                Specify output file (default: stdout)\n"
+             "\n"
+             "  -f, --fps <float>           Set FPS (default: 25.0)\n"
+             "  -c, --codec <string>        Set the codec fourcc (default: 'h264')\n"
+             "\n", stderr );
+}
+
+/*****************************************************************************
+ * Parse: fill cfg from the command line; 0 on success, -1 to make main() stop
+ *****************************************************************************/
+static int  Parse( int argc, char **argv, cfg_t *cfg )
+{
+    /* Set default values */
+    cfg->psz_fin = NULL;
+    cfg->psz_fout = NULL;
+    cfg->f_fps = 25.0;
+    memcpy( cfg->fcc, "h264", 4 );
+
+    /* Parse command line options */
+    opterr = 0; // no error message
+    for( ;; )
+    {
+        int long_options_index;
+        static struct option long_options[] =
+        {
+            { "help",   no_argument,       NULL, 'h' },
+            { "input",  required_argument, NULL, 'i' },
+            { "output", required_argument, NULL, 'o' },
+            { "fps",    required_argument, NULL, 'f' },
+            { "codec",  required_argument, NULL, 'c' },
+            {0, 0, 0, 0}
+        };
+
+        int c;
+        c = getopt_long( argc, argv, "hi:o:f:c:",
+                         long_options, &long_options_index);
+
+        if( c == -1 )
+            break;      /* no option left */
+
+        switch( c )
+        {
+            case 'h':
+                Help();
+                return -1;
+            case 0:
+                break;
+            case 'i':
+                cfg->psz_fin = strdup( optarg );
+                break;
+            case 'o':
+                cfg->psz_fout = strdup( optarg );
+                break;
+            case 'f':
+                cfg->f_fps = atof( optarg );
+                /* the AVI header divides by the rate: refuse 0, negatives and NaN */
+                if( !( cfg->f_fps > 0.0 ) )
+                {
+                    fprintf( stderr, "invalid fps (%s)\n", optarg );
+                    return -1;
+                }
+                break;
+            case 'c':
+                memset( cfg->fcc, ' ', 4 );
+                memcpy( cfg->fcc, optarg, strlen( optarg ) < 4 ? strlen( optarg ) : 4 );
+                break;
+            default:
+                fprintf( stderr, "unknown option (%c)\n", optopt );
+                return -1;
+        }
+    }
+
+    return 0;
+}
+
+/*****************************************************************************
+ * h264_parser_*:
+ *****************************************************************************/
+void h264_parser_init( h264_t *h )
+{
+    /* No picture size and no frame_num width are known before the first SPS */
+    h->i_width = h->i_height = 0;
+    h->i_log2_max_frame_num = 0;
+    h->b_key = 0;
+
+    /* -1 marks "nothing parsed yet" for the values tracked across NALs */
+    h->i_nal_type = h->i_ref_idc = -1;
+    h->i_idr_pic_id = h->i_frame_num = -1;
+}
+void h264_parser_parse( h264_t *h, nal_t *nal, int *pb_nal_start )
+{
+    /* Parse the start of an SPS or slice NAL to track the picture size, the
+     * frame_num and the key frame state.  *pb_nal_start is set to 1 when the
+     * NAL starts a new picture: an SPS/PPS, a slice whose frame_num differs
+     * from the previous one, or an IDR slice with a new idr_pic_id after an IDR. */
+    bs_t s;
+    *pb_nal_start = 0;
+
+    if( nal->i_type == NAL_SPS || nal->i_type == NAL_PPS )
+        *pb_nal_start = 1;
+
+    bs_init( &s, nal->p_payload, nal->i_payload );
+    if( nal->i_type == NAL_SPS )
+    {
+        int i_tmp;
+
+        bs_skip( &s, 8 + 1+1+1 + 5 + 8 );
+        /* sps id */
+        bs_read_ue( &s );
+        /* Read i_log2_max_frame_num */
+        h->i_log2_max_frame_num = bs_read_ue( &s ) + 4;
+        /* Read poc_type */
+        i_tmp = bs_read_ue( &s );
+        if( i_tmp == 0 )
+        {
+            /* skip i_log2_max_poc_lsb */
+            bs_read_ue( &s );
+        }
+        else if( i_tmp == 1 )
+        {
+            int i_cycle;
+            /* skip b_delta_pic_order_always_zero */
+            bs_skip( &s, 1 );
+            /* skip i_offset_for_non_ref_pic */
+            bs_read_se( &s );
+            /* skip i_offset_for_top_to_bottom_field */
+            bs_read_se( &s );
+            /* read i_num_ref_frames_in_poc_cycle */
+            i_cycle = bs_read_ue( &s );
+            if( i_cycle > 256 ) i_cycle = 256;
+            while( i_cycle-- > 0 )  /* the counter must run down or the loop never ends */
+            {
+                /* skip i_offset_for_ref_frame */
+                bs_read_se(&s );
+            }
+        }
+        /* i_num_ref_frames */
+        bs_read_ue( &s );
+        /* b_gaps_in_frame_num_value_allowed */
+        bs_skip( &s, 1 );
+
+        /* Read size */
+        h->i_width  = 16 * ( bs_read_ue( &s ) + 1 );
+        h->i_height = 16 * ( bs_read_ue( &s ) + 1 );
+
+        /* b_frame_mbs_only */
+        i_tmp = bs_read( &s, 1 );
+        if( i_tmp == 0 )
+            bs_skip( &s, 1 );   /* mb_adaptive_frame_field */
+        /* b_direct8x8_inference */
+        bs_skip( &s, 1 );
+
+        /* crop ? */
+        i_tmp = bs_read( &s, 1 );
+        if( i_tmp )
+        {
+            /* left */
+            h->i_width -= 2 * bs_read_ue( &s );
+            /* right */
+            h->i_width -= 2 * bs_read_ue( &s );
+            /* top */
+            h->i_height -= 2 * bs_read_ue( &s );
+            /* bottom */
+            h->i_height -= 2 * bs_read_ue( &s );
+        }
+
+        /* vui: ignored */
+    }
+    else if( nal->i_type >= NAL_SLICE && nal->i_type <= NAL_SLICE_IDR )
+    {
+        int i_tmp;
+
+        /* i_first_mb */
+        bs_read_ue( &s );
+        /* picture type */
+        switch( bs_read_ue( &s ) )
+        {
+            case 0: case 5: /* P */
+            case 1: case 6: /* B */
+            case 3: case 8: /* SP */
+                h->b_key = 0;
+                break;
+            case 2: case 7: /* I */
+            case 4: case 9: /* SI */
+                h->b_key = 1;
+                break;
+        }
+        /* pps id */
+        bs_read_ue( &s );
+
+        /* frame num */
+        i_tmp = bs_read( &s, h->i_log2_max_frame_num );
+
+        if( i_tmp != h->i_frame_num )
+            *pb_nal_start = 1;
+
+        h->i_frame_num = i_tmp;
+
+        if( nal->i_type == NAL_SLICE_IDR )
+        {
+            i_tmp = bs_read_ue( &s );
+            if( h->i_nal_type == NAL_SLICE_IDR && h->i_idr_pic_id != i_tmp )
+                *pb_nal_start = 1;
+
+            h->i_idr_pic_id = i_tmp;
+        }
+    }
+    h->i_nal_type = nal->i_type;
+    h->i_ref_idc = nal->i_ref_idc;
+}
+
+
+static int  ParseNAL( nal_t *nal, avi_t *a, h264_t *h, int *pb_slice )
+{
+    /* Returns 1 when the frame gathered so far must be written out: this NAL
+     * starts a new picture while a slice is pending.  *pb_slice remembers
+     * whether a slice was seen since the last flush; a is not used. */
+    const int b_is_slice = nal->i_type >= NAL_SLICE && nal->i_type <= NAL_SLICE_IDR;
+    int b_new_picture;
+    int b_must_flush;
+
+    h264_parser_parse( h, nal, &b_new_picture );
+    b_must_flush = b_new_picture && *pb_slice;
+
+    if( b_must_flush )
+        *pb_slice = 0;
+    if( b_is_slice )
+        *pb_slice = 1;
+    return b_must_flush;
+}
+
+/*****************************************************************************
+ * vbuf: variable buffer
+ *****************************************************************************/
+void vbuf_init( vbuf_t *v )
+{
+    v->i_data = 0;              /* nothing stored yet */
+    v->i_data_max = 10000;      /* starting capacity, vbuf_add() enlarges it on demand */
+    if( ( v->p_data = malloc( v->i_data_max ) ) == NULL ) { perror( "vbuf_init" ); exit( 1 ); } /* void API: OOM is fatal */
+}
+void vbuf_add( vbuf_t *v, int i_data, void *p_data )
+{
+    if( i_data + v->i_data >= v->i_data_max )   /* would fill the buffer: grow it by i_data */
+    {
+        v->i_data_max += i_data;
+        v->p_data = realloc( v->p_data, v->i_data_max );
+        if( v->p_data == NULL ) { perror( "vbuf_add" ); exit( 1 ); }    /* void API: OOM is fatal */
+    }
+    memcpy( &v->p_data[v->i_data], p_data, i_data );    /* append after the current content */
+    v->i_data += i_data;
+}
+void vbuf_reset( vbuf_t *v )
+{
+    v->i_data = 0;  /* forget the content; p_data and i_data_max are left untouched */
+}
+
+/*****************************************************************************
+ * avi:
+ *****************************************************************************/
+void avi_write_uint16( avi_t *a, uint16_t w )
+{
+    fputc( ( w      ) & 0xff, a->f );   /* low byte first: little endian */
+    fputc( ( w >> 8 ) & 0xff, a->f );   /* then the high byte */
+}
+
+void avi_write_uint32( avi_t *a, uint32_t dw )
+{
+    int i_shift;
+    /* little endian: least significant byte first */
+    for( i_shift = 0; i_shift < 32; i_shift += 8 )
+        fputc( ( dw >> i_shift ) & 0xff, a->f );
+}
+
+void avi_write_fourcc( avi_t *a, char fcc[4] )
+{
+    int i;
+    /* the four characters go out in order, without any terminator */
+    for( i = 0; i < 4; i++ )
+        fputc( fcc[i], a->f );
+}
+
+/* Flags in avih */
+#define AVIF_HASINDEX       0x00000010  // Index at end of file?
+#define AVIF_ISINTERLEAVED  0x00000100
+#define AVIF_TRUSTCKTYPE    0x00000800  // Use CKType to find key frames?
+
+#define AVIIF_KEYFRAME      0x00000010L /* this frame is a key frame.*/
+
+void avi_write_header( avi_t *a )    /* writes everything from "RIFF" up to and including the "movi" tag */
+{
+    avi_write_fourcc( a, "RIFF" );
+    avi_write_uint32( a, a->i_riff > 0 ? a->i_riff - 8 : 0xFFFFFFFF );  /* placeholder while i_riff is 0 */
+    avi_write_fourcc( a, "AVI " );
+
+    avi_write_fourcc( a, "LIST" );
+    avi_write_uint32( a,  4 + 4*16 + 12 + 4*16 + 4*12 );    /* "hdrl" + avih + strl list header + strh + strf */
+    avi_write_fourcc( a, "hdrl" );
+
+    avi_write_fourcc( a, "avih" );
+    avi_write_uint32( a, 4*16 - 8 );            /* payload: the 14 dwords below */
+    avi_write_uint32( a, 1000000 / a->f_fps );  /* microseconds per frame */
+    avi_write_uint32( a, 0xffffffff );          /* max bytes per second */
+    avi_write_uint32( a, 0 );                   /* padding granularity */
+    avi_write_uint32( a, AVIF_HASINDEX|AVIF_ISINTERLEAVED|AVIF_TRUSTCKTYPE);
+    avi_write_uint32( a, a->i_frame );          /* total frames */
+    avi_write_uint32( a, 0 );                   /* initial frames */
+    avi_write_uint32( a, 1 );                   /* number of streams */
+    avi_write_uint32( a, 1000000 );             /* suggested buffer size */
+    avi_write_uint32( a, a->i_width );
+    avi_write_uint32( a, a->i_height );
+    avi_write_uint32( a, 0 );                   /* four reserved dwords */
+    avi_write_uint32( a, 0 );
+    avi_write_uint32( a, 0 );
+    avi_write_uint32( a, 0 );
+
+    avi_write_fourcc( a, "LIST" );
+    avi_write_uint32( a,  4 + 4*16 + 4*12 );    /* "strl" + strh + strf */
+    avi_write_fourcc( a, "strl" );
+
+    avi_write_fourcc( a, "strh" );
+    avi_write_uint32( a,  4*16 - 8 );           /* payload size */
+    avi_write_fourcc( a, "vids" );              /* stream type */
+    avi_write_fourcc( a, a->fcc );              /* handler */
+    avi_write_uint32( a, 0 );                   /* flags */
+    avi_write_uint32( a, 0 );                   /* priority and language */
+    avi_write_uint32( a, 0 );                   /* initial frames */
+    avi_write_uint32( a, 1000 );                /* scale */
+    avi_write_uint32( a, a->f_fps * 1000 );     /* rate: rate / scale is the frame rate */
+    avi_write_uint32( a, 0 );                   /* start */
+    avi_write_uint32( a, a->i_frame );          /* length in frames */
+    avi_write_uint32( a, 1024*1024 );           /* suggested buffer size */
+    avi_write_uint32( a, -1 );                  /* quality */
+    avi_write_uint32( a, a->i_width * a->i_height );    /* sample size */
+    avi_write_uint32( a, 0 );                   /* frame rectangle: left and top as two 16 bit zeros */
+    avi_write_uint16( a, a->i_width );          /* frame rectangle: right */
+    avi_write_uint16( a, a->i_height );         /* frame rectangle: bottom */
+
+    avi_write_fourcc( a, "strf" );
+    avi_write_uint32( a,  4*12 - 8 );           /* payload size */
+    avi_write_uint32( a,  4*12 - 8 );           /* size field of the bitmap header itself */
+    avi_write_uint32( a,  a->i_width );
+    avi_write_uint32( a,  a->i_height );
+    avi_write_uint16( a,  1 );                  /* planes */
+    avi_write_uint16( a,  24 );                 /* bits per pixel */
+    avi_write_fourcc( a,  a->fcc );             /* compression */
+    avi_write_uint32( a, a->i_width * a->i_height );    /* image size */
+    avi_write_uint32( a,  0 );                  /* x and y pixels per meter, colours used and important */
+    avi_write_uint32( a,  0 );
+    avi_write_uint32( a,  0 );
+    avi_write_uint32( a,  0 );
+
+    avi_write_fourcc( a, "LIST" );
+    avi_write_uint32( a,  a->i_movi_end > 0 ? a->i_movi_end - a->i_movi + 4: 0xFFFFFFFF );  /* placeholder while i_movi_end is 0 */
+    avi_write_fourcc( a, "movi" );
+}
+
+/* Append the "idx1" chunk to a->f: the tag, the payload size and then the
+ * a->i_frame index entries (16 bytes each) accumulated in a->idx. */
+void avi_write_idx( avi_t *a )
+{
+    const int i_bytes = a->i_frame * 16;
+
+    avi_write_fourcc( a, "idx1" );
+    avi_write_uint32( a,  i_bytes );
+
+    fwrite( a->idx, i_bytes, 1, a->f );
+}
+
+/*
+ * Initialise *a for writing an AVI to f: store the stream, frame rate and
+ * codec fourcc, zero every counter/offset and the index, write a first
+ * (placeholder) header and remember in a->i_movi the file position that
+ * follows it.
+ */
+void avi_init( avi_t *a, FILE *f, float f_fps, char fcc[4] )
+{
+    /* caller supplied parameters */
+    a->f     = f;
+    a->f_fps = f_fps;
+    memcpy( a->fcc, fcc, 4 );
+
+    /* picture size and frame count start at zero */
+    a->i_frame  = 0;
+    a->i_width  = 0;
+    a->i_height = 0;
+
+    /* file offsets are not known yet */
+    a->i_riff     = 0;
+    a->i_movi     = 0;
+    a->i_movi_end = 0;
+
+    /* empty index */
+    a->idx       = NULL;
+    a->i_idx_max = 0;
+
+    avi_write_header( a );
+    a->i_movi = ftell( a->f );
+}
+
+/* Store dw at _p as four bytes, least significant byte first. */
+static void avi_set_dw( void *_p, uint32_t dw )
+{
+    uint8_t *p_out = _p;
+    int i;
+
+    for( i = 0; i < 4; i++ )
+    {
+        p_out[i] = dw & 0xff;
+        dw >>= 8;
+    }
+}
+
+/*
+ * Append one "00dc" chunk holding the v->i_data bytes found at v->p_data and
+ * record it in the in-memory index that avi_write_idx() dumps later.
+ *
+ *  a      AVI writer state; a->f must be open for writing
+ *  v      buffer holding the chunk payload
+ *  b_key  non-zero to flag the index entry with AVIIF_KEYFRAME
+ *
+ * The index grows by 1000 entries (16 bytes each) at a time. Room for the
+ * entry is secured BEFORE anything is written: if the allocation fails the
+ * frame is dropped with a message on stderr, and neither the file, the index
+ * nor a->i_frame is modified.
+ */
+void avi_write( avi_t *a, vbuf_t *v, int b_key )
+{
+    int64_t i_pos;
+
+    /* Grow the index if needed; keep the old block until realloc succeeds */
+    if( a->i_idx_max <= a->i_frame )
+    {
+        const int i_idx_max = a->i_idx_max + 1000;
+        void *p_idx = realloc( a->idx, i_idx_max * 16 );
+
+        if( p_idx == NULL )
+        {
+            fprintf( stderr, "avi: cannot grow the index, frame %d dropped\n", a->i_frame );
+            return;
+        }
+        a->idx       = p_idx;
+        a->i_idx_max = i_idx_max;
+    }
+
+    /* position of the chunk header, stored in the index entry below */
+    i_pos = ftell( a->f );
+
+    /* chunk header */
+    avi_write_fourcc( a, "00dc" );
+    avi_write_uint32( a, v->i_data );
+
+    fwrite( v->p_data, v->i_data, 1, a->f );
+
+    if( v->i_data&0x01 )
+    {
+        /* pad odd-sized payloads with one zero byte */
+        fputc( 0, a->f );
+    }
+
+    /* Append idx entry: tag, flags, position, unpadded size (4 bytes each) */
+    memcpy( &a->idx[4*a->i_frame+0], "00dc", 4 );
+    avi_set_dw( &a->idx[4*a->i_frame+1], b_key ? AVIIF_KEYFRAME : 0 );
+    avi_set_dw( &a->idx[4*a->i_frame+2], i_pos );
+    avi_set_dw( &a->idx[4*a->i_frame+3], v->i_data );
+
+    a->i_frame++;
+}
+
+/*
+ * Finish the file: note where the movi data stops, append the index, note the
+ * total size, then seek back to offset 0 and rewrite the header with the
+ * final values. A short summary is printed on stderr. The stream itself is
+ * neither flushed nor closed here.
+ */
+void avi_end( avi_t *a )
+{
+    /* the index starts right where the movi data stops */
+    a->i_movi_end = ftell( a->f );
+    avi_write_idx( a );
+
+    /* position after the index, used by the header for the RIFF size */
+    a->i_riff = ftell( a->f );
+
+    /* Fix header */
+    fseek( a->f, 0, SEEK_SET );
+    avi_write_header( a );
+
+    fprintf( stderr,
+             "avi file written\n"
+             "  - codec: %4.4s\n"
+             "  - size: %dx%d\n"
+             "  - fps: %.3f\n"
+             "  - frames: %d\n",
+             a->fcc,
+             a->i_width, a->i_height,
+             a->f_fps,
+             a->i_frame );
+}
+
+/*****************************************************************************
+ * nal:
+ *****************************************************************************/
+/*
+ * Decode one NAL unit: read the header byte into nal->i_type (low 5 bits) and
+ * nal->i_ref_idc (bits 5-6), then copy the remaining bytes to nal->p_payload
+ * while dropping the 0x03 of every 0x00 0x00 0x03 sequence.
+ *
+ *  nal     receives type, ref_idc and payload; p_payload must already point to
+ *          storage of at least i_data - 1 bytes
+ *  p_data  the i_data bytes of the unit, header byte first
+ *
+ * nal->i_payload is set to the number of bytes stored at nal->p_payload.
+ * Returns 0 on success, -1 (with i_payload = 0) when i_data < 1 since there
+ * is then no header byte to read.
+ */
+int nal_decode( nal_t *nal, void *p_data, int i_data )
+{
+    uint8_t *src = p_data;
+    uint8_t *end;
+    uint8_t *dst_start = nal->p_payload;
+    uint8_t *dst = dst_start;
+
+    if( i_data < 1 )
+    {
+        nal->i_payload = 0;
+        return -1;
+    }
+    end = src + i_data;
+
+    nal->i_type    = src[0]&0x1f;
+    nal->i_ref_idc = (src[0] >> 5)&0x03;
+
+    src++;
+
+    while( src < end )
+    {
+        /* "end - src >= 3" also covers a sequence made of the last 3 bytes,
+         * and never forms a pointer located before the buffer */
+        if( end - src >= 3 && src[0] == 0x00 && src[1] == 0x00 && src[2] == 0x03 )
+        {
+            *dst++ = 0x00;
+            *dst++ = 0x00;
+
+            src += 3;
+            continue;
+        }
+        *dst++ = *src++;
+    }
+
+    /* length of what was written to the payload, not a distance to p_data */
+    nal->i_payload = dst - dst_start;
+    return 0;
+}
+
diff --git a/tools/x264-rd.sh b/tools/x264-rd.sh
new file mode 100755
index 00000000..8a287b56
--- /dev/null
+++ b/tools/x264-rd.sh
@@ -0,0 +1,32 @@
+#!/bin/sh
+#
+# Encode $YUV once per --qp value from 1 to 51 with $X264 and write to $DAT one
+# line per run, taken from the "x264: overall" line that the encoder prints on
+# stderr: QP, kb/s, PSNR Y/U/V and fps.
+
+X264="../x264"
+YUV="/usr/src/yuv/af-720x576.yuv"
+OUT="/tmp/x264-$$.h264"
+
+DAT="x264-rd.dat"
+
+OPTS="-c"
+
+# Init
+rm -f "$DAT"
+echo "#QP kb/s   PSNR Y     U     V     fps" > "$DAT"
+
+for qp in $(seq 1 51)
+do
+    LOG="/tmp/x264-$qp-$$.log"
+    # clean
+    rm -f "$LOG"
+    # encode ($OPTS stays unquoted on purpose so that it splits into words)
+    $X264 "$YUV" -o "$OUT" --qp "$qp" $OPTS 2> "$LOG"
+    # gather stats
+    grep '^x264: overall' "$LOG" |
+    sed 's/^x264: overall PSNR Y:\([[:digit:]]*\.[[:digit:]]*\) U:\([[:digit:]]*\.[[:digit:]]*\) V:\([[:digit:]]*\.[[:digit:]]*\) kb\/s:\([[:digit:]]*\.[[:digit:]]*\) fps:\([[:digit:]]*\.[[:digit:]]*\)$/\1 \2 \3 \4 \5/g' |
+    awk -v QP="$qp" '{ printf( "%2d %7.1f      %5.2f %5.2f %5.2f %5.3f\n", QP, $4, $1, $2, $3, $5 ); }' >> "$DAT"
+    # remove this run's log now: each QP uses its own file, so a single
+    # removal after the loop would leave all the others behind in /tmp
+    rm -f "$LOG"
+done
+
+# Clean
+rm -f "$OUT"
+
diff --git a/tools/xyuv.c b/tools/xyuv.c
new file mode 100644
index 00000000..0a5c0928
--- /dev/null
+++ b/tools/xyuv.c
@@ -0,0 +1,607 @@
+/*****************************************************************************
+ * xyuv.c: an SDL YUV 4:2:0 planar viewer.
+ *****************************************************************************
+ * Copyright (C) 2004 Laurent Aimar
+ * $Id: xyuv.c,v 1.1 2004/06/03 19:27:08 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdint.h>
+
+#include <SDL/SDL.h>
+
+#define YUV_MAX 20
+#define SDL_TITLE "xyuv: %s - %d/%d - %.2ffps"
+/* Whole state of the viewer: input geometry, playback flags, the opened
+ * files and the SDL objects used to show them. */
+typedef struct
+{
+    /* globals */
+    int     i_width;        /* width of one input frame, in pixels */
+    int     i_height;       /* height of one input frame, in pixels */
+    int     i_frame_size;   /* bytes per frame: 3 * i_width * i_height / 2 */
+    int     i_frame;        /* current frame, counted from 1 */
+    int     i_frames;       /* largest frame count among the files */
+    float   f_fps;          /* playback rate, sets the delay between frames */
+
+    float   f_y;            /* factor applied to the luma samples, 0.0 = off */
+
+    int     b_pause;        /* playback is stopped on the current frame */
+    int     b_grid;         /* draw a dotted 16x16 grid on the luma plane */
+    int     b_split;        /* show the Y, U and V planes side by side */
+    int     b_diff;         /* show the absolute difference of two files */
+    int     i_join;         /* column where the second file takes over from the first one, <= 0 = off */
+
+    /* Constructed picture */
+    int     i_wall_width;   /* in picture count */
+
+    /* YUV files */
+    int     i_yuv;          /* number of used entries in yuv[] */
+    struct
+    {
+        char    *name;      /* copy of the path given on the command line */
+        FILE    *f;         /* handles */
+        int     i_frames;   /* frames count */
+
+        /* Position in the whole picture */
+        int     x, y;
+    } yuv[YUV_MAX];
+
+    /* SDL */
+    int i_sdl_width;        /* size of the overlay, i.e. of the whole picture */
+    int i_sdl_height;
+
+    int i_display_width;    /* size of the window, follows resize events */
+    int i_display_height;
+    char *title;            /* window caption, allocated by asprintf() */
+
+    SDL_Surface *screen;
+    SDL_Overlay *overlay;
+
+    /* buffer of i_frame_size bytes holding the frame just read from a file */
+    uint8_t *pic;
+
+} xyuv_t;
+
+/* The single viewer instance with its start-up defaults. Members that are not
+ * named here (b_grid, the yuv[] entries, the SDL fields, ...) start zeroed
+ * because the object has static storage duration. */
+xyuv_t xyuv = {
+    .i_width = 0,           /* 0 = not given yet */
+    .i_height = 0,
+    .i_frame  = 1,          /* frames are counted from 1 */
+    .i_frames = 0,
+    .f_fps = 25.0,
+    .f_y = 0.0,             /* 0.0 = leave the luma untouched */
+    .i_wall_width = 0,      /* 0 = let main() pick a value */
+
+    .i_yuv = 0,
+
+    .b_pause = 0,
+    .b_split = 0,
+    .b_diff = 0,
+    .i_join = -1,           /* -1 = join mode not requested */
+
+    .title = NULL,
+    .pic = NULL,
+};
+
+/* Print the command line syntax on stderr. The option names listed here must
+ * match the ones tested by the parser in main(). */
+static void help( void )
+{
+    fprintf( stderr,
+             "Syntax: xyuv [options] file [file2 ...]\n"
+             "\n"
+             "      --help                  Print this help\n"
+             "\n"
+             "  -s, --size <WIDTHxHEIGHT>   Set input size\n"
+             "  -w, --width <integer>       Set width\n"
+             "  -h, --height <integer>      Set height\n"
+             "\n"
+             "  -S, --split                 Show splited Y/U/V planes\n"
+             "  -d, --diff                  Show difference (only 2 files) in split mode\n"
+             "  -j, --join <integer>        Show file 1 left of column <integer> and file 2\n"
+             "                              right of it (only 2 files)\n"
+             "\n"
+             "  -y <float>                  Set Y factor\n"
+             "\n"
+             "  -g, --grid                  Show a grid (macroblock 16x16)\n"
+             "  -W <integer>                Set wall width (in picture count)\n"
+             "  -f, --fps <float>           Set fps\n"
+             "\n" );
+}
+
+
+static void xyuv_display( xyuv_t *xyuv, int i_frame );
+
+/*
+ * Entry point: parse the command line, open the YUV files, work out the frame
+ * geometry and the layout of the SDL window, then run the display/event loop.
+ *
+ * Every argument that is not a known option is taken as a file to open; at
+ * most YUV_MAX files are kept. When no size is given it is looked for as
+ * <width>x<height> inside the name of the first file.
+ *
+ * Returns 0 after --help and -1 on a usage or initialisation error. The loop
+ * itself never returns: quitting from the window calls exit( 1 ).
+ */
+int main( int argc, char **argv )
+{
+    int i;
+
+    /* Parse command line */
+    for( i = 1; i < argc; i++ ) {
+        if( !strcasecmp( argv[i], "--help" ) ) {
+            help();
+            return 0;
+        }
+        if( !strcmp( argv[i], "-d" ) || !strcasecmp( argv[i], "--diff" ) ) {
+            xyuv.b_diff = 1;
+        } else if( !strcmp( argv[i], "-S" ) || !strcasecmp( argv[i], "--split" ) ) {
+            xyuv.b_split = 1;
+        } else if( !strcmp( argv[i], "-f" ) || !strcasecmp( argv[i], "--fps" ) ) {
+            if( i >= argc -1 ) goto err_missing_arg;
+            xyuv.f_fps = atof( argv[++i] );
+        } else if( !strcmp( argv[i], "-h" ) || !strcasecmp( argv[i], "--height" ) ) {
+            if( i >= argc -1 ) goto err_missing_arg;
+            xyuv.i_height = atoi( argv[++i] );
+        } else if( !strcmp( argv[i], "-w" ) || !strcasecmp( argv[i], "--width" ) ) {
+            if( i >= argc -1 ) goto err_missing_arg;
+            xyuv.i_width = atoi( argv[++i] );
+        } else if( !strcmp( argv[i], "-s" ) || !strcasecmp( argv[i], "--size" ) ) {
+            char *p;
+
+            if( i >= argc -1 ) goto err_missing_arg;
+
+            xyuv.i_width = strtol( argv[++i], &p, 0 );
+            /* the height follows a one character separator; without one there
+             * is nothing to read and p + 1 would be past the end of the string */
+            xyuv.i_height = *p != '\0' ? atoi( p + 1 ) : 0;
+        } else if( !strcmp( argv[i], "-W" ) ) {
+            if( i >= argc -1 ) goto err_missing_arg;
+            xyuv.i_wall_width = atoi( argv[++i] );
+        } else if( !strcmp( argv[i], "-y" ) ) {
+            if( i >= argc -1 ) goto err_missing_arg;
+            xyuv.f_y = atof( argv[++i] );
+        } else if( !strcmp( argv[i], "-j" ) || !strcasecmp( argv[i], "--join" ) ) {
+            if( i >= argc -1 ) goto err_missing_arg;
+            xyuv.i_join = atoi( argv[++i] );
+        } else if( !strcmp( argv[i], "-g" ) || !strcasecmp( argv[i], "--grid" ) ) {
+            xyuv.b_grid = 1;
+        } else if( xyuv.i_yuv >= YUV_MAX ) {
+            /* yuv[] is full, storing one more entry would overflow it */
+            fprintf( stderr, "too many files (max %d), ignoring %s\n", YUV_MAX, argv[i] );
+        } else {
+            FILE *f = fopen( argv[i], "rb" );
+            if( !f ) {
+                fprintf( stderr, "cannot open YUV %s\n", argv[i] );
+            } else {
+                xyuv.yuv[xyuv.i_yuv].name = strdup( argv[i] );
+                xyuv.yuv[xyuv.i_yuv].f = f;
+                xyuv.yuv[xyuv.i_yuv].i_frames = 0;
+
+                xyuv.i_yuv++;
+            }
+        }
+    }
+
+    if( xyuv.i_yuv == 0 ) {
+        fprintf( stderr, "no file to display\n" );
+        return -1;
+    }
+    if( !( xyuv.f_fps > 0.0 ) ) {
+        /* the frame delay is computed as 1000 / f_fps */
+        fprintf( stderr, "invalid fps\n" );
+        return -1;
+    }
+    if( xyuv.i_width <= 0 || xyuv.i_height <= 0 ) {
+        char *psz = xyuv.yuv[0].name;
+        char *num;
+        char *x;
+        /* See if we find widthxheight in the file name */
+        for( ;; )
+        {
+            if( !( x = strchr( psz+1, 'x' ) ) )
+            {
+                break;
+            }
+            /* walk back over the digits located just before the 'x' */
+            num = x;
+            while( num > psz && num[-1] >= '0' && num[-1] <= '9' )
+                num--;
+
+            if( num != x && x[1] >= '0' && x[1] <= '9' )
+            {
+                xyuv.i_width = atoi( num );
+                xyuv.i_height = atoi( x+1 );
+                break;
+            }
+            psz = x;
+        }
+        fprintf( stderr, "file name gives %dx%d\n", xyuv.i_width, xyuv.i_height );
+    }
+    if( xyuv.i_width <= 0 || xyuv.i_height <= 0 ) {
+        fprintf( stderr, "invalid or missing frames size\n" );
+        return -1;
+    }
+    if( xyuv.b_diff && xyuv.i_yuv != 2 ) {
+        fprintf( stderr, "--diff works only with 2 files\n" );
+        return -1;
+    }
+    /* -1 means that --join was not requested */
+    if( xyuv.i_join != -1 &&
+        ( xyuv.i_yuv != 2 || xyuv.i_join < 1 || xyuv.i_join >= xyuv.i_width ) ) {
+        fprintf( stderr, "--join works only with two files and range is [1, width-1]\n" );
+        return -1;
+    }
+    /* keep the join column even so that it also falls on a chroma sample */
+    if( xyuv.i_join > 0 && xyuv.i_join % 2 != 0 ) {
+        if( xyuv.i_join + 1 < xyuv.i_width )
+            xyuv.i_join++;
+        else
+            xyuv.i_join--;
+    }
+
+    /* Now check frames */
+    fprintf( stderr, "displaying :\n" );
+    xyuv.i_frames = 0;
+    xyuv.i_frame_size = 3 * xyuv.i_width * xyuv.i_height / 2;
+    for( i = 0; i < xyuv.i_yuv; i++ ) {
+        /* Ugly, but avoids using fstat */
+        fseek( xyuv.yuv[i].f, 0, SEEK_END );
+
+        xyuv.yuv[i].i_frames = ftell( xyuv.yuv[i].f ) / xyuv.i_frame_size;
+
+        fseek( xyuv.yuv[i].f, 0, SEEK_SET );
+
+        fprintf( stderr, " - '%s' : %d frames\n", xyuv.yuv[i].name, xyuv.yuv[i].i_frames );
+
+        if( xyuv.i_frames < xyuv.yuv[i].i_frames )
+            xyuv.i_frames = xyuv.yuv[i].i_frames;
+    }
+
+    if( xyuv.i_frames == 0 ) {
+        fprintf( stderr, "no frames to display\n" );
+    }
+
+    xyuv.pic = malloc( xyuv.i_frame_size );
+    if( xyuv.pic == NULL ) {
+        fprintf( stderr, "cannot allocate a frame buffer of %d bytes\n", xyuv.i_frame_size );
+        return -1;
+    }
+
+    /* calculate SDL view */
+    if( xyuv.i_wall_width < 0 ) {
+        /* a negative count makes no sense, fall back to the automatic value */
+        xyuv.i_wall_width = 0;
+    }
+    if( xyuv.i_wall_width > xyuv.i_yuv ) {
+        xyuv.i_wall_width = xyuv.i_yuv;
+    }
+    if( xyuv.i_wall_width == 0 ) {
+        /* smallest width whose square holds every picture */
+        while( xyuv.i_wall_width < xyuv.i_yuv && xyuv.i_wall_width * xyuv.i_wall_width < xyuv.i_yuv ) {
+            xyuv.i_wall_width++;
+        }
+    }
+
+    for( i = 0; i < xyuv.i_yuv; i++ ) {
+        if( xyuv.b_diff || xyuv.i_join > 0 ) {
+            xyuv.yuv[i].x = 0;
+            xyuv.yuv[i].y = 0;
+        } else if( xyuv.b_split ) {
+            xyuv.yuv[i].x = (i%xyuv.i_wall_width) * 3 * xyuv.i_width / 2;
+            xyuv.yuv[i].y = (i/xyuv.i_wall_width) * xyuv.i_height;
+        } else {
+            xyuv.yuv[i].x = (i%xyuv.i_wall_width) * xyuv.i_width;
+            xyuv.yuv[i].y = (i/xyuv.i_wall_width) * xyuv.i_height;
+        }
+    }
+    if( xyuv.b_diff ) {
+        xyuv.i_sdl_width = 3 * xyuv.i_width / 2;
+        xyuv.i_sdl_height= xyuv.i_height;
+    } else if( xyuv.i_join > 0 ) {
+        xyuv.i_sdl_width = xyuv.i_width;
+        xyuv.i_sdl_height= xyuv.i_height;
+    } else if( xyuv.b_split ) {
+        xyuv.i_sdl_width = xyuv.i_wall_width * 3 * xyuv.i_width / 2;
+        xyuv.i_sdl_height= xyuv.i_height * ( ( xyuv.i_yuv  + xyuv.i_wall_width - 1 ) / xyuv.i_wall_width );
+    } else {
+        xyuv.i_sdl_width = xyuv.i_wall_width * xyuv.i_width;
+        xyuv.i_sdl_height= xyuv.i_height * ( ( xyuv.i_yuv  + xyuv.i_wall_width - 1 ) / xyuv.i_wall_width );
+    }
+    xyuv.i_display_width = xyuv.i_sdl_width;
+    xyuv.i_display_height = xyuv.i_sdl_height;
+
+    /* Open SDL */
+    if( SDL_Init( SDL_INIT_EVENTTHREAD|SDL_INIT_NOPARACHUTE|SDL_INIT_VIDEO) ) {
+        fprintf( stderr, "cannot init SDL\n" );
+        return -1;
+    }
+
+    SDL_EnableKeyRepeat(SDL_DEFAULT_REPEAT_DELAY, 100 );
+    SDL_EventState( SDL_KEYUP, SDL_IGNORE );
+
+    xyuv.screen = SDL_SetVideoMode( xyuv.i_sdl_width, xyuv.i_sdl_height, 0,
+                                    SDL_HWSURFACE|SDL_RESIZABLE|
+                                    SDL_ASYNCBLIT|SDL_HWACCEL );
+    if( xyuv.screen == NULL ) {
+        fprintf( stderr, "SDL_SetVideoMode failed\n" );
+        return -1;
+    }
+
+    SDL_LockSurface( xyuv.screen );
+    xyuv.overlay = SDL_CreateYUVOverlay( xyuv.i_sdl_width, xyuv.i_sdl_height,
+                                         SDL_YV12_OVERLAY,
+                                         xyuv.screen );
+    /* test the result before the planes are touched below */
+    if( xyuv.overlay == NULL ) {
+        SDL_UnlockSurface( xyuv.screen );
+        fprintf( stderr, "recon: SDL_CreateYUVOverlay failed\n" );
+        return -1;
+    }
+    /* reset with black */
+    memset( xyuv.overlay->pixels[0],   0, xyuv.overlay->pitches[0] * xyuv.i_sdl_height );
+    memset( xyuv.overlay->pixels[1], 128, xyuv.overlay->pitches[1] * xyuv.i_sdl_height / 2);
+    memset( xyuv.overlay->pixels[2], 128, xyuv.overlay->pitches[2] * xyuv.i_sdl_height / 2);
+    SDL_UnlockSurface( xyuv.screen );
+
+    for( ;; ) {
+        SDL_Event event;
+        int64_t i_start = SDL_GetTicks();
+        int i_wait;
+
+        if( !xyuv.b_pause ) {
+            xyuv_display( &xyuv, xyuv.i_frame );
+        }
+
+        /* handle events until the display time of this frame has elapsed */
+        for( ;; ) {
+            int b_refresh = 0;
+            while( SDL_PollEvent( &event ) )  {
+                switch( event.type )
+                {
+                    case SDL_QUIT:
+                        exit( 1 );
+
+                    case SDL_KEYDOWN:
+                        switch( event.key.keysym.sym )
+                        {
+                            case SDLK_q:
+                            case SDLK_ESCAPE:
+                                exit(1);
+
+                            case SDLK_f:
+                                SDL_WM_ToggleFullScreen( xyuv.screen );
+                                break;
+
+                            case SDLK_g:
+                                xyuv.b_grid = !xyuv.b_grid;
+                                if( xyuv.b_pause )
+                                    b_refresh = 1;
+                                break;
+
+                            case SDLK_SPACE:
+                                xyuv.b_pause = !xyuv.b_pause;
+                                break;
+
+                            case SDLK_LEFT:
+                                if( xyuv.i_frame > 1 ) xyuv.i_frame--;
+                                b_refresh = 1;
+                                break;
+
+                            case SDLK_RIGHT:
+                                if( xyuv.i_frame < xyuv.i_frames ) xyuv.i_frame++;
+                                b_refresh = 1;
+                                break;
+
+                            case SDLK_HOME:
+                                xyuv.i_frame = 1;
+                                if( xyuv.b_pause )
+                                    b_refresh = 1;
+                                break;
+
+                            case SDLK_END:
+                                xyuv.i_frame = xyuv.i_frames;
+                                b_refresh = 1;
+                                break;
+
+                            case SDLK_UP:
+                                xyuv.i_frame += xyuv.i_frames / 20;
+                                if( xyuv.i_frame > xyuv.i_frames )
+                                    xyuv.i_frame = xyuv.i_frames;
+                                b_refresh = 1;
+                                break;
+
+                            case SDLK_DOWN:
+                                xyuv.i_frame -= xyuv.i_frames / 20;
+                                if( xyuv.i_frame < 1 )
+                                    xyuv.i_frame = 1;
+                                b_refresh = 1;
+                                break;
+
+                            case SDLK_PAGEUP:
+                                xyuv.i_frame += xyuv.i_frames / 10;
+                                if( xyuv.i_frame > xyuv.i_frames )
+                                    xyuv.i_frame = xyuv.i_frames;
+                                b_refresh = 1;
+                                break;
+
+                            case SDLK_PAGEDOWN:
+                                xyuv.i_frame -= xyuv.i_frames / 10;
+                                if( xyuv.i_frame < 1 )
+                                    xyuv.i_frame = 1;
+                                b_refresh = 1;
+                                break;
+
+                            default:
+                                break;
+                        }
+                        break;
+                    case SDL_VIDEORESIZE:
+                        xyuv.i_display_width = event.resize.w;
+                        xyuv.i_display_height = event.resize.h;
+                        xyuv.screen = SDL_SetVideoMode( xyuv.i_display_width, xyuv.i_display_height, 0,
+                                                        SDL_HWSURFACE|SDL_RESIZABLE|
+                                                        SDL_ASYNCBLIT|SDL_HWACCEL );
+                        xyuv_display( &xyuv, xyuv.i_frame );
+                        break;
+
+                    default:
+                        break;
+                }
+            }
+            if( b_refresh ) {
+                /* seeking pauses the playback on the selected frame */
+                xyuv.b_pause = 1;
+                xyuv_display( &xyuv, xyuv.i_frame );
+            }
+            /* wait, by slices of 200 ms at most so that events stay responsive */
+            i_wait = 1000 / xyuv.f_fps - ( SDL_GetTicks() - i_start);
+            if( i_wait < 0 )
+                break;
+            else if( i_wait > 200 )
+                SDL_Delay( 200 );
+            else {
+                SDL_Delay( i_wait );
+                break;
+            }
+        }
+        if( !xyuv.b_pause ) {
+            /* next frame */
+            if( xyuv.i_frame == xyuv.i_frames )
+                xyuv.b_pause = 1;
+            else if( xyuv.i_frame < xyuv.i_frames )
+                xyuv.i_frame++;
+        }
+    }
+
+
+    return 0;
+
+err_missing_arg:
+    fprintf( stderr, "missing arg for option=%s\n", argv[i] );
+    return -1;
+}
+
+
+/*
+ * Build frame number i_frame (counted from 1) into the overlay and show it.
+ *
+ * Each file that still has this frame is read into xyuv->pic and copied at its
+ * position: as a plain picture, as Y/U/V planes laid out in the luma plane
+ * (b_split, b_diff) or as the left/right part of a joined picture (i_join).
+ * A file whose frame cannot be read is skipped exactly like a file that is
+ * too short. The luma factor f_y and the grid are then applied, the overlay
+ * is displayed and the window caption is refreshed.
+ *
+ * Nothing is done when i_frame is larger than xyuv->i_frames.
+ */
+static void xyuv_display( xyuv_t *xyuv, int i_frame )
+{
+    SDL_Rect rect;
+    int i_picture = 0;
+    int i;
+
+    if( i_frame > xyuv->i_frames )
+        return;
+
+    xyuv->i_frame = i_frame;
+
+    /* Load and copy picture data */
+    for( i = 0; i < xyuv->i_yuv; i++ ) {
+        int i_plane;
+
+        if( i_frame - 1 >= xyuv->yuv[i].i_frames )
+            continue;
+
+        /* the offset is computed as a long, the product may not fit an int */
+        if( fseek( xyuv->yuv[i].f, (long)(xyuv->i_frame-1) * xyuv->i_frame_size, SEEK_SET ) ||
+            fread( xyuv->pic, xyuv->i_frame_size, 1, xyuv->yuv[i].f ) != 1 ) {
+            /* do not show whatever an earlier read left in pic */
+            fprintf( stderr, "cannot read frame %d of '%s'\n", xyuv->i_frame, xyuv->yuv[i].name );
+            continue;
+        }
+        /* rank of this picture among the ones really shown for this frame */
+        i_picture++;
+
+        SDL_LockYUVOverlay( xyuv->overlay );
+
+        if( xyuv->b_diff || xyuv->b_split ) {
+            /* Reset UV */
+            for( i_plane = 1; i_plane < 3; i_plane++ ) {
+                memset( xyuv->overlay->pixels[i_plane], 128, xyuv->overlay->pitches[i_plane] * xyuv->overlay->h / 2 );
+            }
+            /* Show the three planes (or their diff) in Y plane of overlay */
+
+            for( i_plane = 0; i_plane < 3; i_plane++ ) {
+                int div = i_plane == 0 ? 1 : 2;
+                uint8_t *src = xyuv->pic;
+                uint8_t *dst = xyuv->overlay->pixels[0] +
+                                (xyuv->yuv[i].x + xyuv->yuv[i].y * xyuv->overlay->pitches[0] );
+                int j;
+                /* both chroma planes go right of the luma one, the second
+                 * below the first */
+                if( i_plane == 1 ) {
+                    src +=  5*xyuv->i_width * xyuv->i_height/4;
+                    dst += xyuv->i_width;
+                } else if( i_plane == 2 ) {
+                    src += xyuv->i_width * xyuv->i_height;
+                    dst += xyuv->i_width + xyuv->i_height / 2 * xyuv->overlay->pitches[0];
+                }
+
+                for( j = 0; j < xyuv->i_height / div; j++ ) {
+                    if( i_picture == 1 || xyuv->b_split ) {
+                        memcpy( dst, src, xyuv->i_width / div );
+                    } else {
+                        /* second picture of a diff: |first - second| */
+                        int k;
+                        for( k = 0; k < xyuv->i_width / div; k++ ) {
+                            dst[k] = abs( dst[k] - src[k]);
+                        }
+                    }
+                    src += xyuv->i_width / div;
+                    dst += xyuv->overlay->pitches[0];
+                }
+            }
+        } else {
+            for( i_plane = 0; i_plane < 3; i_plane++ ) {
+                int div = i_plane == 0 ? 1 : 2;
+                uint8_t *src = xyuv->pic;
+                uint8_t *dst = xyuv->overlay->pixels[i_plane] +
+                                ((xyuv->yuv[i].x + xyuv->yuv[i].y * xyuv->overlay->pitches[i_plane] ) / div );
+                int w = xyuv->i_width / div;
+                int j;
+
+                /* overlay plane 1 is fed from the last quarter-size plane of
+                 * the file and plane 2 from the one stored right after luma */
+                if( i_plane == 1 ) {
+                    src +=  5*xyuv->i_width * xyuv->i_height/4;
+                } else if( i_plane == 2 ) {
+                    src += xyuv->i_width * xyuv->i_height;
+                }
+                if( xyuv->i_join > 0 ) {
+                    if( i_picture > 1 ) {
+                        /* second picture: columns from i_join to the right */
+                        src += xyuv->i_join / div;
+                        dst += xyuv->i_join / div;
+                        w = (xyuv->i_width - xyuv->i_join) /div;
+                    } else {
+                        /* first picture: columns left of i_join */
+                        w = xyuv->i_join / div;
+                    }
+                }
+
+                for( j = 0; j < xyuv->i_height / div; j++ ) {
+                    memcpy( dst, src, w );
+                    src += xyuv->i_width / div;
+                    dst += xyuv->overlay->pitches[i_plane];
+                }
+            }
+        }
+
+        SDL_UnlockYUVOverlay( xyuv->overlay );
+    }
+
+    /* the two passes below write overlay pixels, so they need the lock too */
+    SDL_LockYUVOverlay( xyuv->overlay );
+
+    if( xyuv->f_y != 0.0 ) {
+        /* scale every luma sample by f_y, clamped to [0, 255] */
+        uint8_t *pix = xyuv->overlay->pixels[0];
+        int j;
+
+        for( j = 0; j < xyuv->i_sdl_height; j++ ) {
+            int k;
+            for( k = 0; k < xyuv->i_sdl_width; k++ ) {
+                int v= pix[k] * xyuv->f_y;
+                if( v > 255 )
+                    pix[k] = 255;
+                else if( v < 0 )
+                    pix[k] = 0;
+                else
+                    pix[k] = v;
+            }
+            pix += xyuv->overlay->pitches[0];
+        }
+    }
+    if( xyuv->b_grid ) {
+        int x, y;
+
+        /* one black dot every 4 pixels along the lines and columns that are
+         * multiples of 16 */
+        for( y = 0; y < xyuv->i_sdl_height; y += 4 ) {
+            uint8_t *p = xyuv->overlay->pixels[0] + y * xyuv->overlay->pitches[0];
+            for( x = 0; x < xyuv->i_sdl_width; x += 4 ) {
+                if( x%16== 0 || y%16 == 0 )
+                    p[x] = 0;
+            }
+        }
+    }
+
+    SDL_UnlockYUVOverlay( xyuv->overlay );
+
+    /* Update display */
+    rect.x = 0;
+    rect.y = 0;
+    rect.w = xyuv->i_display_width;
+    rect.h = xyuv->i_display_height;
+    SDL_DisplayYUVOverlay( xyuv->overlay, &rect );
+
+    /* Display title */
+    free( xyuv->title );
+    if( asprintf( &xyuv->title, SDL_TITLE, xyuv->yuv[0].name, xyuv->i_frame, xyuv->i_frames, xyuv->f_fps ) < 0 ) {
+        /* the pointer is indeterminate after a failure: keep the previous
+         * caption and make sure the next call does not free garbage */
+        xyuv->title = NULL;
+        return;
+    }
+    SDL_WM_SetCaption( xyuv->title, "" );
+}
+
+
+
+
diff --git a/vfw/build/cygwin/Makefile b/vfw/build/cygwin/Makefile
new file mode 100644
index 00000000..e5f37468
--- /dev/null
+++ b/vfw/build/cygwin/Makefile
@@ -0,0 +1,117 @@
+##############################################################################
+#
+# Makefile for x264 VFW driver
+#
+# Author: XviD project:
+#            - ??? <cutka at szm.sk>,
+#            - Edouard Gomez <ed.gomez at free.fr>
+#            - Peter Ross <pross@xvid.org>
+# Ported to x264 by Laurent Aimar <fenrir@via.ecp.fr>
+#
+# $Id: Makefile,v 1.1 2004/06/03 19:29:33 fenrir Exp $
+##############################################################################
+
+# Dll to build
+DLL=x264vfw.dll
+
+# Current dir
+DIR_CUR=$(shell pwd)
+
+# Paths to the include files, the library and the sources
+DIR_INC=$(DIR_CUR)/../../..
+DIR_LIB=$(DIR_CUR)/../../..
+DIR_SRC=$(DIR_CUR)/../..
+
+# Sources
+SRC_C= codec.c config.c driverproc.c
+SRC_RES= resource.rc
+
+# Alias
+RM= rm -rf
+WINDRES=windres
+
+##############################################################################
+# CFLAGS
+##############################################################################
+
+# Constants which should not be modified
+# The `mingw-runtime` package is required when building with -mno-cygwin
+CFLAGS += -I$(DIR_SRC)/w32api -I$(DIR_INC)
+CFLAGS += -D_WIN32_IE=0x0500
+CFLAGS += -mno-cygwin
+
+# Optional Compiler options
+CFLAGS += -Wall
+CFLAGS += -O2
+CFLAGS += -fstrength-reduce
+CFLAGS += -finline-functions
+CFLAGS += -fgcse
+CFLAGS += -freduce-all-givs
+CFLAGS += -ffast-math
+
+##############################################################################
+# Compiler flags for linking stage
+##############################################################################
+
+LDFLAGS += -L$(DIR_LIB) -lx264
+
+##############################################################################
+# Rules
+##############################################################################
+
+OBJECTS = $(SRC_C:.c=.obj)
+OBJECTS+= $(SRC_RES:.rc=.obj)
+
+.SUFFIXES: .obj .rc .c
+
+DIR_BUILD= $(DIR_CUR)/bin
+VPATH = $(DIR_SRC):$(DIR_BUILD)
+
+all: $(DLL)
+
+$(DIR_BUILD):
+	@echo " D: $(DIR_BUILD)"
+	@mkdir -p $(DIR_BUILD)
+
+.rc.obj:
+	@echo " W: $(@D)/$(<F)"
+	@mkdir -p $(DIR_BUILD)/$(@D)
+	@$(WINDRES) \
+	--include-dir=$(DIR_SRC) \
+	--input-format=rc \
+	--output-format=coff \
+	-o $(DIR_BUILD)/$@ $<
+
+.c.obj:
+	@echo " C: $(@D)/$(<F)"
+	@mkdir -p $(DIR_BUILD)/$(@D)
+	@$(CC) $(CFLAGS) -c -o $(DIR_BUILD)/$@ $<
+
+$(DLL): $(DIR_BUILD) $(OBJECTS)
+	@echo " L: $(@F)"
+	@cp $(DIR_SRC)/driverproc.def $(DIR_BUILD)/driverproc.def
+	@cd $(DIR_BUILD) && \
+	$(CC) \
+	-mno-cygwin -shared -Wl,-dll,--out-implib,$@.a,--enable-stdcall-fixup \
+	-o $@ \
+	$(OBJECTS) driverproc.def \
+	-lgdi32 -lwinmm -lcomdlg32 -lcomctl32 $(LDFLAGS) 
+
+clean:
+	@echo " Cl: Object files and target lib"
+	@$(RM) $(DIR_BUILD)
+
+install:
+	@echo " I: x264vfw.dll"
+	@cp $(DIR_BUILD)/$(DLL) $(DLL)
+	@cp $(DIR_SRC)/build/win32/bin/x264vfw.inf .
+	@rundll32.exe setupapi,InstallHinfSection DefaultInstall 132 ./x264vfw.inf
+	@rm $(DLL)
+	@rm x264vfw.inf
+
+uninstall:
+	@echo " U: x264vfw.dll"
+	@cp $(DIR_SRC)/build/win32/bin/x264vfw.inf .
+	@rundll32.exe setupapi,InstallHinfSection Remove_x264 132 ./x264vfw.inf
+	@rm x264vfw.inf
+
diff --git a/vfw/build/win32/bin/x264vfw.inf b/vfw/build/win32/bin/x264vfw.inf
new file mode 100644
index 00000000..d4ff64a1
--- /dev/null
+++ b/vfw/build/win32/bin/x264vfw.inf
@@ -0,0 +1,91 @@
+; x264 Codec install
+
+[Version]
+Signature = "$CHICAGO$"
+Class = MEDIA
+
+[SourceDisksNames]
+1="x264 Codec Install Disk",, 0001
+
+[SourceDisksFiles]
+x264vfw.dll=1
+x264vfw.inf=1
+
+[Installable.Drivers]
+x264 = 1:x264vfw.dll, "vidc.X264", "x264 H.264 Video Codec" , , ,
+
+[DefaultInstall]
+CopyFiles=H264.Copy.Inf,H264.Copy
+Updateinis = H264.Updateini
+DelReg = H264.DelConfig
+addreg = H264.AddReg,H264.AddReg9x,H264.DoReg
+MediaType = SOFTWARE
+
+[DefaultInstall.ntx86]
+CopyFiles=H264.Copy.Inf,H264.Copy
+DelReg = H264.DelConfig
+addreg = H264.AddReg,H264.AddRegNT,H264.DoReg
+MediaType = SOFTWARE
+
+[Remove_x264]
+AddReg = H264.Unregister
+DelReg = H264.DelReg
+DelFiles = H264.Copy,H264.Copy.Inf
+UpdateInis = H264.DelIni
+
+[H264.Copy]
+x264vfw.dll
+
+[H264.Copy.Inf]
+x264vfw.inf
+
+[H264.UpdateIni]
+system.ini, drivers32,,"vidc.X264=x264vfw.dll"
+
+[H264.DelIni]
+system.ini, drivers32,"vidc.X264=x264vfw.dll",
+
+[H264.AddReg]
+
+[H264.AddReg9x]
+HKLM,SYSTEM\CurrentControlSet\Control\MediaResources\icm\vidc.X264,Description,,%x264%
+HKLM,SYSTEM\CurrentControlSet\Control\MediaResources\icm\vidc.X264,Driver,,x264vfw.dll
+HKLM,SYSTEM\CurrentControlSet\Control\MediaResources\icm\vidc.X264,FriendlyName,,"x264"
+
+HKLM,%UnInstallPath%,DisplayName,,%UninstallDispName%
+HKLM,%UnInstallPath%,UninstallString,,"%10%\rundll.exe setupx.dll,InstallHinfSection Remove_x264 132 %17%\%InfFile%"
+
+[H264.AddRegNT]
+HKLM,SOFTWARE\Microsoft\Windows NT\CurrentVersion\drivers.desc,x264vfw.dll,,%x264%
+HKLM,SOFTWARE\Microsoft\Windows NT\CurrentVersion\drivers32,vidc.X264,,x264vfw.dll
+
+HKLM,%UnInstallPath%,DisplayName,,%UninstallDispName%
+HKLM,%UnInstallPath%,UninstallString,,"%11%\rundll32.exe setupapi,InstallHinfSection Remove_x264 132 %17%\%InfFile%"
+
+[H264.DoReg]
+;HKLM,Software\Microsoft\Windows\CurrentVersion\RunOnce\Setup,"Registering x264 Direct Show ;Decoder...",,"%11%\regsvr32.exe /s %11%\x264.ax"
+
+[H264.DelReg]
+HKLM,SYSTEM\CurrentControlSet\Control\MediaResources\icm\vidc.X264
+
+HKLM,SOFTWARE\Microsoft\Windows NT\CurrentVersion\drivers.desc,x264vfw.dll,,""
+HKLM,%UnInstallPath%
+
+[H264.Unregister]
+;HKLM,Software\Microsoft\Windows\CurrentVersion\RunOnce\Setup,"Unregistering x264 Direct Show ;Decoder...",,"%11%\regsvr32.exe /s /u %11%\x264.ax"
+
+[H264.DelConfig]
+HKCU,Software\GNU\x264
+
+[DestinationDirs]
+DefaultDestDir = 11	; LDID_SYS
+H264.Copy = 11
+H264.Copy.Inf = 17
+
+[Strings]
+x264="x264 H.264 Video Codec"
+InfFile="x264vfw.inf"
+UninstallDispName="x264 H.264/AVC CODEC"
+UnInstallPath="Software\Microsoft\Windows\CurrentVersion\Uninstall\x264"
+MediaClassName="Media Devices"
+mfgname="Fenrir, Justin, CM"
diff --git a/vfw/build/win32/x264vfw.dsp b/vfw/build/win32/x264vfw.dsp
new file mode 100644
index 00000000..0fadf913
--- /dev/null
+++ b/vfw/build/win32/x264vfw.dsp
@@ -0,0 +1,135 @@
+# Microsoft Developer Studio Project File - Name="x264vfw" - Package Owner=<4>
+# Microsoft Developer Studio Generated Build File, Format Version 6.00
+# ** DO NOT EDIT **
+
+# TARGTYPE "Win32 (x86) Dynamic-Link Library" 0x0102
+
+CFG=x264vfw - Win32 Debug
+!MESSAGE This is not a valid makefile. To build this project using NMAKE,
+!MESSAGE use the Export Makefile command and run
+!MESSAGE 
+!MESSAGE NMAKE /f "x264vfw.mak".
+!MESSAGE 
+!MESSAGE You can specify a configuration when running NMAKE
+!MESSAGE by defining the macro CFG on the command line. For example:
+!MESSAGE 
+!MESSAGE NMAKE /f "x264vfw.mak" CFG="x264vfw - Win32 Debug"
+!MESSAGE 
+!MESSAGE Possible choices for configuration are:
+!MESSAGE 
+!MESSAGE "x264vfw - Win32 Release" (based on "Win32 (x86) Dynamic-Link Library")
+!MESSAGE "x264vfw - Win32 Debug" (based on "Win32 (x86) Dynamic-Link Library")
+!MESSAGE 
+
+# Begin Project
+# PROP AllowPerConfigDependencies 0
+# PROP Scc_ProjName ""
+# PROP Scc_LocalPath ""
+CPP=cl.exe
+MTL=midl.exe
+RSC=rc.exe
+
+!IF  "$(CFG)" == "x264vfw - Win32 Release"
+
+# PROP BASE Use_MFC 0
+# PROP BASE Use_Debug_Libraries 0
+# PROP BASE Output_Dir "Release"
+# PROP BASE Intermediate_Dir "Release"
+# PROP BASE Target_Dir ""
+# PROP Use_MFC 0
+# PROP Use_Debug_Libraries 0
+# PROP Output_Dir "obj/Release"
+# PROP Intermediate_Dir "obj/Release"
+# PROP Ignore_Export_Lib 0
+# PROP Target_Dir ""
+# ADD BASE CPP /nologo /MT /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_WINDOWS" /D "_MBCS" /D "_USRDLL" /D "VFW_EXPORTS" /YX /FD /c
+# ADD CPP /nologo /MT /W3 /GX /O2 /I "../../../extras" /I "../../.." /D "WIN32" /D "NDEBUG" /D "_WINDOWS" /D "_MBCS" /D "_USRDLL" /D "VFW_EXPORTS" /YX /FD /c
+# ADD BASE MTL /nologo /D "NDEBUG" /mktyplib203 /win32
+# ADD MTL /nologo /D "NDEBUG" /mktyplib203 /win32
+# ADD BASE RSC /l 0x804 /d "NDEBUG"
+# ADD RSC /l 0x804 /d "NDEBUG"
+BSC32=bscmake.exe
+# ADD BASE BSC32 /nologo
+# ADD BSC32 /nologo
+LINK32=link.exe
+# ADD BASE LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /dll /machine:I386
+# ADD LINK32 winmm.lib vfw32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /dll /machine:I386 /out:"bin/x264vfw.dll"
+
+!ELSEIF  "$(CFG)" == "x264vfw - Win32 Debug"
+
+# PROP BASE Use_MFC 0
+# PROP BASE Use_Debug_Libraries 1
+# PROP BASE Output_Dir "Debug"
+# PROP BASE Intermediate_Dir "Debug"
+# PROP BASE Target_Dir ""
+# PROP Use_MFC 0
+# PROP Use_Debug_Libraries 1
+# PROP Output_Dir "obj/Debug"
+# PROP Intermediate_Dir "obj/Debug"
+# PROP Ignore_Export_Lib 0
+# PROP Target_Dir ""
+# ADD BASE CPP /nologo /MTd /W3 /Gm /GX /ZI /Od /D "WIN32" /D "_DEBUG" /D "_WINDOWS" /D "_MBCS" /D "_USRDLL" /D "VFW_EXPORTS" /YX /FD /GZ /c
+# ADD CPP /nologo /MTd /W3 /Gm /GX /ZI /Od /I "../../../extras" /I "../../.." /D "WIN32" /D "_DEBUG" /D "_WINDOWS" /D "_MBCS" /D "_USRDLL" /D "VFW_EXPORTS" /YX /FD /GZ /c
+# ADD BASE MTL /nologo /D "_DEBUG" /mktyplib203 /win32
+# ADD MTL /nologo /D "_DEBUG" /mktyplib203 /win32
+# ADD BASE RSC /l 0x804 /d "_DEBUG"
+# ADD RSC /l 0x804 /d "_DEBUG"
+BSC32=bscmake.exe
+# ADD BASE BSC32 /nologo
+# ADD BSC32 /nologo
+LINK32=link.exe
+# ADD BASE LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /dll /debug /machine:I386 /pdbtype:sept
+# ADD LINK32 winmm.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /dll /debug /machine:I386 /out:"bin/x264vfw.dll" /pdbtype:sept
+
+!ENDIF 
+
+# Begin Target
+
+# Name "x264vfw - Win32 Release"
+# Name "x264vfw - Win32 Debug"
+# Begin Group "Source Files"
+
+# PROP Default_Filter "cpp;c;cxx;rc;def;r;odl;idl;hpj;bat"
+# Begin Source File
+
+SOURCE=..\..\codec.c
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\config.c
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\driverproc.c
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\driverproc.def
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\resource.rc
+# End Source File
+# End Group
+# Begin Group "Header Files"
+
+# PROP Default_Filter "h;hpp;hxx;hm;inl"
+# Begin Source File
+
+SOURCE=..\..\resource.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\x264vfw.h
+# End Source File
+# End Group
+# Begin Group "Resource Files"
+
+# PROP Default_Filter "ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe"
+# End Group
+# Begin Source File
+
+SOURCE=..\..\..\build\win32\bin\libx264.lib
+# End Source File
+# End Target
+# End Project
diff --git a/vfw/build/win32/x264vfw.dsw b/vfw/build/win32/x264vfw.dsw
new file mode 100644
index 00000000..7d24ef97
--- /dev/null
+++ b/vfw/build/win32/x264vfw.dsw
@@ -0,0 +1,29 @@
+Microsoft Developer Studio Workspace File, Format Version 6.00
+# WARNING: DO NOT EDIT OR DELETE THIS WORKSPACE FILE!
+
+###############################################################################
+
+Project: "x264vfw"=.\x264vfw.dsp - Package Owner=<4>
+
+Package=<5>
+{{{
+}}}
+
+Package=<4>
+{{{
+}}}
+
+###############################################################################
+
+Global:
+
+Package=<5>
+{{{
+}}}
+
+Package=<3>
+{{{
+}}}
+
+###############################################################################
+
diff --git a/vfw/codec.c b/vfw/codec.c
new file mode 100644
index 00000000..076d7962
--- /dev/null
+++ b/vfw/codec.c
@@ -0,0 +1,276 @@
+/*****************************************************************************
+ * codec.c: vfw x264 encoder
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: codec.c,v 1.1 2004/06/03 19:27:09 fenrir Exp $
+ *
+ * Authors: Justin Clay
+ *          Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#include "x264vfw.h"
+
+/* get_csp:
+ *  map a VfW BITMAPINFOHEADER onto an x264 colorspace.
+ *  Returns an X264_CSP_* value, or X264_CSP_NONE if the format is
+ *  unsupported. For the BI_RGB formats X264_CSP_VFLIP is OR'ed in unless
+ *  biHeight is negative (positive-height DIBs are stored bottom-up). */
+static int get_csp( BITMAPINFOHEADER *hdr )
+{
+    /* Only used by the BI_RGB branch below */
+    const int i_vflip = ( hdr->biHeight >= 0 ) ? X264_CSP_VFLIP : 0;
+
+    switch( hdr->biCompression )
+    {
+        case FOURCC_I420:
+        case FOURCC_IYUV:
+            return X264_CSP_I420;
+
+        case FOURCC_YV12:
+            return X264_CSP_YV12;
+
+        case FOURCC_YUYV:
+        case FOURCC_YUY2:
+            return X264_CSP_YUYV;
+
+        case BI_RGB:
+            /* Only 24 and 32 bits per pixel are accepted */
+            switch( hdr->biBitCount )
+            {
+                case 24: return X264_CSP_BGR  | i_vflip;
+                case 32: return X264_CSP_BGRA | i_vflip;
+                default: return X264_CSP_NONE;
+            }
+
+        default:
+            break;
+    }
+
+    return X264_CSP_NONE;
+}
+
+/* compress_query:
+ *  tell the caller whether lpbiInput can be compressed, optionally into the
+ *  specific format lpbiOutput (NULL means "only check the input").
+ *  Returns ICERR_OK if the combination is supported, ICERR_BADFORMAT
+ *  otherwise. */
+LRESULT compress_query( CODEC *codec, BITMAPINFO *lpbiInput, BITMAPINFO *lpbiOutput )
+{
+    BITMAPINFOHEADER *inhdr = &lpbiInput->bmiHeader;
+    BITMAPINFOHEADER *outhdr;
+    CONFIG           *config = &codec->config;
+
+    if( get_csp( inhdr ) == X264_CSP_NONE )
+        return ICERR_BADFORMAT;
+
+    if( lpbiOutput == NULL )
+        return ICERR_OK;
+    outhdr = &lpbiOutput->bmiHeader;
+
+    /* No scaling: output must have the input dimensions */
+    if( inhdr->biWidth != outhdr->biWidth ||
+        inhdr->biHeight != outhdr->biHeight )
+        return ICERR_BADFORMAT;
+
+    /* We need x16 width/height */
+    if( inhdr->biWidth % 16 != 0 || inhdr->biHeight % 16 != 0 )
+        return ICERR_BADFORMAT;
+
+    /* The requested *output* must carry our configured fourcc. The input
+     * header holds a raw format (already validated by get_csp above), so
+     * testing it against the codec fourcc rejected practically every
+     * request that supplied an output header. */
+    if( outhdr->biCompression != mmioFOURCC( config->fcc[0], config->fcc[1],
+                                             config->fcc[2], config->fcc[3] ) )
+        return ICERR_BADFORMAT;
+
+    return ICERR_OK;
+}
+
+/* compress_get_format:
+ *  describe the compressed format produced for lpbiInput.
+ *  With lpbiOutput == NULL the size in bytes of the format header is
+ *  returned; otherwise *lpbiOutput is filled in and ICERR_OK is returned.
+ *  Returns ICERR_BADFORMAT when the input colorspace is not supported. */
+LRESULT compress_get_format( CODEC *codec, BITMAPINFO *lpbiInput, BITMAPINFO *lpbiOutput )
+{
+    CONFIG           *config = &codec->config;
+    BITMAPINFOHEADER *src = &lpbiInput->bmiHeader;
+    BITMAPINFOHEADER *dst = &lpbiOutput->bmiHeader;
+
+    if( get_csp( src ) == X264_CSP_NONE )
+        return ICERR_BADFORMAT;
+
+    if( lpbiOutput == NULL )
+        return sizeof(BITMAPINFOHEADER);
+
+    /* Start from the input header (keeps width, height, planes, bit count)
+     * and then override what differs for the compressed stream */
+    memcpy( dst, src, sizeof( BITMAPINFOHEADER ) );
+
+    dst->biSize          = sizeof( BITMAPINFOHEADER );
+    dst->biCompression   = mmioFOURCC( config->fcc[0], config->fcc[1],
+                                       config->fcc[2], config->fcc[3] );
+    dst->biXPelsPerMeter = 0;
+    dst->biYPelsPerMeter = 0;
+    dst->biClrUsed       = 0;
+    dst->biClrImportant  = 0;
+
+    /* Computed from the dimensions just copied into the output header */
+    dst->biSizeImage     = compress_get_size( codec, lpbiInput, lpbiOutput );
+
+    return ICERR_OK;
+}
+
+/* compress_get_size:
+ *  buffer size, in bytes, reported for one compressed frame of the
+ *  lpbiOutput dimensions: 2 * 3 * width * |height|.
+ *  codec and lpbiInput are not used. */
+LRESULT compress_get_size( CODEC *codec, BITMAPINFO *lpbiInput, BITMAPINFO *lpbiOutput )
+{
+    const LONG i_width  = lpbiOutput->bmiHeader.biWidth;
+    LONG       i_height = lpbiOutput->bmiHeader.biHeight;
+
+    /* biHeight is negative for top-down bitmaps (get_csp accepts those);
+     * without this the product, and the biSizeImage derived from it, would
+     * come out negative. */
+    if( i_height < 0 )
+        i_height = -i_height;
+
+    return 2 * i_width * i_height * 3;
+}
+
+/* compress_frames_info:
+ *  remember the stream time base announced by the host: dwScale is kept in
+ *  codec->fincr and dwRate in codec->fbase (both are read back by
+ *  compress_begin). Always returns ICERR_OK. */
+LRESULT compress_frames_info(CODEC * codec, ICCOMPRESSFRAMES * icf )
+{
+    codec->fincr = icf->dwScale;    /* frame duration, in fbase units */
+    codec->fbase = icf->dwRate;     /* time base */
+
+    return ICERR_OK;
+}
+
+/* compress_begin:
+ *  (re)create the x264 encoder for the stream described by lpbiInput, using
+ *  the settings stored in codec->config. An encoder left open by a previous
+ *  call is closed first. lpbiOutput is not used.
+ *  Returns ICERR_OK, or ICERR_ERROR if the encoder cannot be opened. */
+LRESULT compress_begin(CODEC * codec, BITMAPINFO * lpbiInput, BITMAPINFO * lpbiOutput )
+{
+    CONFIG *config = &codec->config;
+    x264_param_t param;
+
+    /* Destroy previous handle */
+    if( codec->h != NULL )
+    {
+        x264_encoder_close( codec->h );
+        codec->h = NULL;
+    }
+
+    /* Get default param */
+    x264_param_default( &param );
+
+    /* Set params: TODO to complete */
+    param.i_width = lpbiInput->bmiHeader.biWidth;
+    /* A negative biHeight only flags a top-down bitmap (get_csp handles the
+     * orientation); the encoder needs the real number of lines. */
+    param.i_height= lpbiInput->bmiHeader.biHeight < 0 ? -lpbiInput->bmiHeader.biHeight
+                                                      :  lpbiInput->bmiHeader.biHeight;
+
+    /* compress_frames_info stores dwScale in fincr and dwRate in fbase, and
+     * the frame rate is dwRate / dwScale, i.e. fbase / fincr (the previous
+     * fincr / fbase was the frame duration). The x264 default is kept when
+     * the host never supplied a usable time base. */
+    if( codec->fbase > 0 && codec->fincr > 0 )
+        param.f_fps   = (float)codec->fbase / (float)codec->fincr;
+
+    param.i_frame_reference = config->i_refmax;
+    param.i_idrframe = config->i_idrframe;
+    param.i_iframe   = config->i_iframe;
+    param.i_qp_constant = config->i_qp;
+    param.b_deblocking_filter = config->b_filter;
+    param.b_cabac = config->b_cabac;
+
+    /* Start with no partition analysis and enable what the user asked for */
+    param.analyse.intra = 0;
+    param.analyse.inter = 0;
+    if( config->b_psub16x16 )
+        param.analyse.inter |= X264_ANALYSE_PSUB16x16;
+    if( config->b_psub8x8 )
+        param.analyse.inter |= X264_ANALYSE_PSUB8x8;
+    if( config->b_i4x4 )
+    {
+        param.analyse.intra |= X264_ANALYSE_I4x4;
+        param.analyse.inter |= X264_ANALYSE_I4x4;
+    }
+
+    /* Placeholder: no mode changes any parameter yet */
+    switch( config->mode )
+    {
+        case 0: /* 1 PASS */
+            break;
+        default:
+            break;
+    }
+
+    /* Open the encoder */
+    codec->h = x264_encoder_open( &param );
+    if( codec->h == NULL )
+        return ICERR_ERROR;
+
+    return ICERR_OK;
+}
+
+/* compress_end:
+ *  close the encoder opened by compress_begin, if any, and clear the
+ *  handle so that calling this again is harmless. Always returns ICERR_OK. */
+LRESULT compress_end(CODEC * codec)
+{
+    if( codec->h == NULL )
+        return ICERR_OK;
+
+    x264_encoder_close( codec->h );
+    codec->h = NULL;
+
+    return ICERR_OK;
+}
+
+/* compress:
+ *  encode the frame in icc->lpInput with the encoder held in codec->h and
+ *  write the resulting NAL units to icc->lpOutput.
+ *  On success icc->lpbiOutput->biSizeImage holds the number of bytes
+ *  written and *icc->lpdwFlags is AVIIF_KEYFRAME for IDR pictures, 0
+ *  otherwise.
+ *  Returns ICERR_OK, or ICERR_BADFORMAT if the input colorspace is not
+ *  supported.
+ *  NOTE(review): codec->h is used without a NULL check and the results of
+ *  x264_encoder_encode / x264_nal_encode are not examined. */
+LRESULT compress( CODEC *codec, ICCOMPRESS *icc )
+{
+    BITMAPINFOHEADER *inhdr = icc->lpbiInput;
+    BITMAPINFOHEADER *outhdr = icc->lpbiOutput;
+
+    x264_picture_t pic;
+
+    int        i_nal;
+    x264_nal_t *nal;
+    int        i_out;   /* bytes written to lpOutput so far */
+
+    int i;
+
+    /* Init the picture */
+    memset( &pic, 0, sizeof( x264_picture_t ) );
+    pic.img.i_csp = get_csp( inhdr );
+
+    /* For now biWidth can be divided by 16 so no problem */
+    switch( pic.img.i_csp & X264_CSP_MASK )
+    {
+        /* Planar 4:2:0: the three planes are taken to be packed back to
+         * back in lpInput, a full-size plane followed by two planes of a
+         * quarter of its size with half its stride. */
+        case X264_CSP_I420:
+        case X264_CSP_YV12:
+            pic.img.i_plane = 3;
+            pic.img.i_stride[0] = inhdr->biWidth;
+            pic.img.i_stride[1] =
+            pic.img.i_stride[2] = inhdr->biWidth / 2;
+
+            pic.img.plane[0]    = (uint8_t*)icc->lpInput;
+            pic.img.plane[1]    = pic.img.plane[0] + inhdr->biWidth * inhdr->biHeight;
+            pic.img.plane[2]    = pic.img.plane[1] + inhdr->biWidth * inhdr->biHeight / 4;
+            break;
+
+        /* Packed formats: a single plane, stride = bytes per pixel * width
+         * (no row padding is accounted for) */
+        case X264_CSP_YUYV:
+            pic.img.i_plane = 1;
+            pic.img.i_stride[0] = 2 * inhdr->biWidth;
+            pic.img.plane[0]    = (uint8_t*)icc->lpInput;
+            break;
+
+        case X264_CSP_BGR:
+            pic.img.i_plane = 1;
+            pic.img.i_stride[0] = 3 * inhdr->biWidth;
+            pic.img.plane[0]    = (uint8_t*)icc->lpInput;
+            break;
+
+        case X264_CSP_BGRA:
+            pic.img.i_plane = 1;
+            pic.img.i_stride[0] = 4 * inhdr->biWidth;
+            pic.img.plane[0]    = (uint8_t*)icc->lpInput;
+            break;
+
+        default:
+            return ICERR_BADFORMAT;
+    }
+
+    /* encode it */
+    x264_encoder_encode( codec->h, &nal, &i_nal, &pic );
+
+    /* create bitstream */
+    i_out = 0;
+    for( i = 0; i < i_nal; i++ )
+    {
+        /* i_size goes in as the space left in the output buffer; it is
+         * presumably updated by x264_nal_encode to the bytes produced,
+         * since it is added to i_out afterwards (TODO confirm) */
+        int i_size = outhdr->biSizeImage - i_out;
+        x264_nal_encode( (uint8_t*)icc->lpOutput + i_out, &i_size, 1, &nal[i] );
+
+        i_out += i_size;
+    }
+    outhdr->biSizeImage = i_out;
+
+    /* Set key frame only for IDR, as they are real synch point, I frame
+       aren't always synch point (ex: with multi refs, ref marking) */
+    /* pic.i_type was zeroed above, so this relies on the encoder storing
+     * the chosen picture type back into pic (TODO confirm) */
+    if( pic.i_type == X264_TYPE_IDR )
+        *icc->lpdwFlags = AVIIF_KEYFRAME;
+    else
+        *icc->lpdwFlags = 0;
+
+    return ICERR_OK;
+}
+
diff --git a/vfw/config.c b/vfw/config.c
new file mode 100644
index 00000000..79f5a7ba
--- /dev/null
+++ b/vfw/config.c
@@ -0,0 +1,443 @@
+/*****************************************************************************
+ * config.c: vfw x264 encoder
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: config.c,v 1.1 2004/06/03 19:27:09 fenrir Exp $
+ *
+ * Authors: Justin Clay
+ *          Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+/**************************************************************************
+ *
+ *  History:
+ *
+ *  2004.05.14  CBR encode mode support
+ *
+ **************************************************************************/
+
+#include "x264vfw.h"
+#include <stdio.h>  /* sprintf */
+#include <commctrl.h>
+
+/* Registry */
+#define X264_REG_KEY	HKEY_CURRENT_USER
+#define X264_REG_PARENT "Software\\GNU\\x264"
+#define X264_REG_CHILD  "x264"
+#define X264_REG_CLASS  "config"
+
+/* window controls */
+#define BITRATE_MAX		5000
+#define QUANT_MAX		51
+
+/* description: caption and text of the "load defaults" confirmation box */
+#define X264_NAME		"x264"
+#define X264_DEF_TEXT	"Are you sure you want to load default values?"
+
+/* Registry handling */
+
+/* One integer setting persisted in the registry */
+typedef struct
+{
+    char *reg_value;    /* registry value name */
+    int  *config_int;   /* where the value is read into / written from */
+    int  default_int;   /* used when the value cannot be read */
+} reg_int_t;
+
+/* One string setting persisted in the registry. The load/save code moves
+ * a fixed 5 bytes (fourcc + terminator) for every entry. */
+typedef struct
+{
+    char *reg_value;    /* registry value name */
+    char *config_str;   /* buffer the value is read into / written from */
+    char *default_str;  /* copied when the value cannot be read */
+} reg_str_t;
+
+/* Staging copy of the configuration: the tables below hold the addresses of
+ * its fields, and config_reg_load / config_reg_save copy the whole struct
+ * from / to the caller's CONFIG. */
+CONFIG reg;
+
+/* Integer settings: { registry value name, destination field, default } */
+static const reg_int_t reg_int_table[] =
+{
+    /* Main dialog */
+    { "bitrate",        &reg.bitrate,           800 },
+    { "quantizer",      &reg.i_qp,              26 },
+    { "encoding_type",  &reg.i_encoding_type,   1 },   /* 1 = quantizer based, see main_enable_item */
+
+    /* Advanced dialog */
+    { "cabac",          &reg.b_cabac,           1 },
+    { "loop_filter",    &reg.b_filter,          1 },
+    { "idrframe",       &reg.i_idrframe,        1 },
+    { "iframe",         &reg.i_iframe,          150 },
+    { "refmax",         &reg.i_refmax,          1 },
+
+    /* analysis */
+    {"i4x4",            &reg.b_i4x4,            1 },
+    {"psub16x16",       &reg.b_psub16x16,       1 },
+    {"psub8x8",         &reg.b_psub8x8,         1 }
+};
+
+/* String settings: { registry value name, destination buffer, default }.
+ * Only the fourcc written into the output format is stored this way. */
+static const reg_str_t reg_str_table[] =
+{
+    { "fourcc",         reg.fcc,                "x264" }
+};
+
+/* config_reg_load:
+ *  fill *config from the values stored under
+ *  X264_REG_PARENT\X264_REG_CHILD, using the table default for every value
+ *  that cannot be read. The values are staged in the file-scope `reg` and
+ *  then copied to *config as a whole. */
+void config_reg_load( CONFIG *config )
+{
+    HKEY    hKey = NULL;
+    DWORD   i_size;
+    int     i;
+
+    /* A missing key (first run, or right after config_reg_defaults) is not
+     * an error: keep hKey NULL so that every setting below takes its
+     * default, instead of querying and closing an uninitialised handle. */
+    if( RegOpenKeyEx( X264_REG_KEY, X264_REG_PARENT "\\" X264_REG_CHILD,
+                      0, KEY_READ, &hKey ) != ERROR_SUCCESS )
+        hKey = NULL;
+
+    /* Read all integers */
+    for( i = 0; i < sizeof( reg_int_table )/sizeof( reg_int_t); i++ )
+    {
+        i_size = sizeof( int );
+        if( hKey == NULL ||
+            RegQueryValueEx( hKey, reg_int_table[i].reg_value, 0, 0,
+                             (LPBYTE)reg_int_table[i].config_int,
+                             &i_size ) != ERROR_SUCCESS )
+            *reg_int_table[i].config_int = reg_int_table[i].default_int;
+    }
+
+    /* Read strings */
+    for( i = 0; i < sizeof( reg_str_table )/sizeof( reg_str_t); i++ )
+    {
+        i_size = 5;   /* fourcc + 1 FIXME ugly */
+        if( hKey == NULL ||
+            RegQueryValueEx( hKey, reg_str_table[i].reg_value, 0, 0,
+                             (LPBYTE)reg_str_table[i].config_str,
+                             &i_size ) != ERROR_SUCCESS )
+            memcpy( reg_str_table[i].config_str,
+                    reg_str_table[i].default_str, 5 );
+    }
+
+    if( hKey != NULL )
+        RegCloseKey( hKey );
+
+    memcpy( config, &reg, sizeof( CONFIG ) );
+}
+
+/* config_reg_save:
+ *  write *config to the registry under X264_REG_PARENT\X264_REG_CHILD,
+ *  creating the key if needed. Nothing is written, and the file-scope
+ *  `reg` is left untouched, when the key cannot be created. Failures of
+ *  individual writes are ignored. */
+void config_reg_save( CONFIG *config )
+{
+    const int i_int_count = sizeof( reg_int_table ) / sizeof( reg_int_table[0] );
+    const int i_str_count = sizeof( reg_str_table ) / sizeof( reg_str_table[0] );
+    HKEY  hKey;
+    DWORD i_disposition;    /* created vs. opened; not examined */
+    int   n;
+
+    if( RegCreateKeyEx( X264_REG_KEY, X264_REG_PARENT "\\" X264_REG_CHILD, 0,
+                        X264_REG_CLASS, REG_OPTION_NON_VOLATILE, KEY_WRITE,
+                        0, &hKey, &i_disposition ) != ERROR_SUCCESS )
+        return;
+
+    /* The tables point into `reg`, so stage the caller's settings there */
+    memcpy( &reg, config, sizeof( CONFIG ) );
+
+    /* Integers are stored as REG_DWORD */
+    for( n = 0; n < i_int_count; n++ )
+    {
+        const reg_int_t *p_int = &reg_int_table[n];
+
+        RegSetValueEx( hKey, p_int->reg_value, 0, REG_DWORD,
+                       (LPBYTE)p_int->config_int, sizeof( int ) );
+    }
+
+    /* Strings are stored as REG_SZ, always 5 bytes (fourcc + 1) FIXME */
+    for( n = 0; n < i_str_count; n++ )
+    {
+        const reg_str_t *p_str = &reg_str_table[n];
+
+        RegSetValueEx( hKey, p_str->reg_value, 0, REG_SZ,
+                       (LPBYTE)p_str->config_str, 5 );
+    }
+
+    RegCloseKey( hKey );
+}
+
+/* config_reg_defaults:
+ *  reset *config to the built-in defaults and persist them.
+ *  The stored settings key is deleted so that config_reg_load falls back to
+ *  its table defaults, then the result is written back with
+ *  config_reg_save. If the key exists but cannot be deleted, the stored
+ *  values are what gets loaded. */
+void config_reg_defaults( CONFIG *config )
+{
+    HKEY hKey;
+
+    /* Just in case */
+    memset( config, 0, sizeof( CONFIG ) );
+
+    /* Failing to open or delete is not fatal: most of the time it just
+     * means nothing was saved yet. Returning early here used to leave the
+     * caller with the all-zero config from above (and leaked hKey when only
+     * the delete failed). */
+    if( RegOpenKeyEx( X264_REG_KEY, X264_REG_PARENT, 0, KEY_ALL_ACCESS, &hKey ) == ERROR_SUCCESS )
+    {
+        RegDeleteKey( hKey, X264_REG_CHILD );
+        RegCloseKey( hKey );
+    }
+
+    config_reg_load( config );
+    config_reg_save( config );
+}
+
+/* main_enable_item:
+ *  enable the bitrate or the quantizer controls of the main dialog
+ *  according to config->i_encoding_type (0: bitrate, 1: quantizer; any
+ *  other value leaves the enabled state alone), then (re)set the range of
+ *  both sliders. */
+static void main_enable_item( HWND hDlg, CONFIG * config )
+{
+    const int i_type = config->i_encoding_type;
+
+    /* 2 (two pass) is not yet implemented and changes nothing here */
+    if( i_type == 0 || i_type == 1 )
+    {
+        const BOOL b_bitrate = ( i_type == 0 ) ? TRUE : FALSE;
+        const BOOL b_quant   = ( i_type == 1 ) ? TRUE : FALSE;
+
+        EnableWindow( GetDlgItem( hDlg, IDC_BITRATEEDIT ), b_bitrate );
+        EnableWindow( GetDlgItem( hDlg, IDC_BITRATESLIDER ), b_bitrate );
+
+        EnableWindow( GetDlgItem( hDlg, IDC_QUANTEDIT ), b_quant );
+        EnableWindow( GetDlgItem( hDlg, IDC_QUANTSLIDER ), b_quant );
+    }
+
+    SendDlgItemMessage( hDlg, IDC_BITRATESLIDER, TBM_SETRANGE, TRUE,
+                        (LPARAM) MAKELONG( 0, BITRATE_MAX ) );
+    SendDlgItemMessage( hDlg, IDC_QUANTSLIDER, TBM_SETRANGE, TRUE,
+                        (LPARAM) MAKELONG( 0, QUANT_MAX ) );
+}
+
+/* main_update_dlg:
+ *  push config into the main dialog: both edit boxes, the encoding type
+ *  radio group and both slider positions. */
+static void main_update_dlg( HWND hDlg, CONFIG * config )
+{
+    int b_known_type = 1;
+    int i_radio = IDC_RADIOBITRATE;
+
+    SetDlgItemInt( hDlg, IDC_BITRATEEDIT, config->bitrate, FALSE );
+    SetDlgItemInt( hDlg, IDC_QUANTEDIT, config->i_qp, FALSE );
+
+    if( config->i_encoding_type == 0 )          /* 1 Pass, Bitrate Based */
+        i_radio = IDC_RADIOBITRATE;
+    else if( config->i_encoding_type == 1 )     /* 1 Pass, Quantizer Based */
+        i_radio = IDC_RADIOQUANT;
+    else if( config->i_encoding_type == 2 )     /* 2 Pass */
+        i_radio = IDC_RADIOTWOPASS;
+    else
+        b_known_type = 0;                       /* leave the radio group alone */
+
+    if( b_known_type )
+        CheckRadioButton( hDlg, IDC_RADIOBITRATE, IDC_RADIOTWOPASS, i_radio );
+
+    SendDlgItemMessage( hDlg, IDC_BITRATESLIDER, TBM_SETPOS, TRUE,
+                        config->bitrate );
+    SendDlgItemMessage( hDlg, IDC_QUANTSLIDER, TBM_SETPOS, TRUE,
+                        config->i_qp );
+}
+
+
+/* Main config dialog */
+/* callback_main:
+ *  dialog procedure of the main configuration dialog. The CONFIG being
+ *  edited arrives as lParam of WM_INITDIALOG and is kept in the window's
+ *  GWL_USERDATA slot; every control change is written to it immediately.
+ *  OK / Cancel only record the choice in config->b_save and close the
+ *  dialog.
+ *  Returns 1 for WM_INITDIALOG, WM_COMMAND and WM_HSCROLL, 0 for any
+ *  other message. */
+BOOL CALLBACK callback_main( HWND hDlg, UINT uMsg, WPARAM wParam, LPARAM lParam )
+{
+    /* Whatever is stored in GWL_USERDATA; replaced below on WM_INITDIALOG */
+    CONFIG* config = (CONFIG*)GetWindowLong(hDlg, GWL_USERDATA);
+
+    switch( uMsg )
+    {
+    case WM_INITDIALOG :
+        SetWindowLong( hDlg, GWL_USERDATA, lParam );
+        config = (CONFIG*)lParam;
+
+        main_enable_item( hDlg, config );
+        main_update_dlg( hDlg, config );
+
+        break;
+
+    case WM_COMMAND:
+        /* HIWORD = notification code, LOWORD = control id */
+        switch ( HIWORD( wParam ) )
+        {
+        case BN_CLICKED :
+            switch( LOWORD( wParam ) )
+            {
+            case IDOK :
+                config->b_save = TRUE;
+                EndDialog( hDlg, LOWORD(wParam) );
+                break;
+            case IDCANCEL :
+                config->b_save = FALSE;
+                EndDialog( hDlg, LOWORD(wParam) );
+                break;
+            case IDC_ADVANCED :
+                /* NOTE(review): the owner passed here is lParam, not hDlg */
+                DialogBoxParam( g_hInst, MAKEINTRESOURCE(IDD_ADVANCED),
+                                (HWND)lParam, callback_advanced,
+                                (LPARAM)config );
+                break;
+            case IDC_DEFAULTS :
+                if( MessageBox( hDlg, X264_DEF_TEXT, X264_NAME, MB_YESNO ) == IDYES )
+                {
+                    config_reg_defaults( config );
+                    main_enable_item( hDlg, config );
+                    main_update_dlg( hDlg, config );
+                }
+                break;
+            case IDC_RADIOBITRATE :
+                config->i_encoding_type = 0; /* 1 Pass, Bitrate Mode=0 */
+                main_enable_item( hDlg, config );
+                main_update_dlg( hDlg, config );
+                break;
+            case IDC_RADIOQUANT :
+                config->i_encoding_type = 1; /* 1 Pass, Quantizer Mode=1 */
+                main_enable_item( hDlg, config );
+                main_update_dlg( hDlg, config );
+                break;
+            case IDC_RADIOTWOPASS :
+                config->i_encoding_type = 2; /* 2 Pass Mode=2 */
+                main_enable_item( hDlg,  config );
+                main_update_dlg( hDlg, config );
+                break;
+            }
+            break;
+        case EN_CHANGE :
+            /* Edit box changed: store the value and move the matching
+             * slider. The value is not clamped to the slider range. */
+            switch( LOWORD( wParam ) )
+            {
+            case IDC_BITRATEEDIT :
+                config->bitrate = GetDlgItemInt( hDlg, IDC_BITRATEEDIT, FALSE, FALSE );
+                SendDlgItemMessage( hDlg, IDC_BITRATESLIDER, TBM_SETPOS, TRUE, config->bitrate );
+                break;
+            case IDC_QUANTEDIT :
+                config->i_qp = GetDlgItemInt( hDlg, IDC_QUANTEDIT, FALSE, FALSE );
+                SendDlgItemMessage( hDlg, IDC_QUANTSLIDER, TBM_SETPOS, TRUE, config->i_qp );
+                break;
+            }
+            break;
+        default:
+            break;
+        }
+        break;
+
+        /* Slider moved: store the position and mirror it in the edit box */
+        case WM_HSCROLL :
+            if( (HWND) lParam == GetDlgItem( hDlg, IDC_BITRATESLIDER ) )
+            {
+                config->bitrate = SendDlgItemMessage( hDlg, IDC_BITRATESLIDER, TBM_GETPOS, 0, 0 );
+                SetDlgItemInt( hDlg, IDC_BITRATEEDIT, config->bitrate, FALSE );
+
+            }
+            else if( (HWND) lParam == GetDlgItem( hDlg, IDC_QUANTSLIDER ) )
+            {
+                config->i_qp = SendDlgItemMessage( hDlg, IDC_QUANTSLIDER, TBM_GETPOS, 0, 0 );
+                SetDlgItemInt( hDlg, IDC_QUANTEDIT, config->i_qp, FALSE );
+            }
+            break;
+
+    default :
+        return 0;
+    }
+
+    return 1;
+}
+
+/* callback_about:
+ *  dialog procedure of the About box. Shows the core build number and the
+ *  compilation date/time, opens X264_WEBSITE when the homepage label is
+ *  clicked and closes on OK / Cancel.
+ *  Returns 1 for WM_INITDIALOG and WM_COMMAND, 0 for any other message. */
+BOOL CALLBACK callback_about( HWND hDlg, UINT uMsg, WPARAM wParam, LPARAM lParam )
+{
+    if( uMsg == WM_INITDIALOG )
+    {
+        char psz_build[1024];
+
+        sprintf( psz_build, "Core %d, build %s %s", X264_BUILD, __DATE__, __TIME__ );
+        SetDlgItemText( hDlg, IDC_BUILD,  psz_build );
+        return 1;
+    }
+
+    if( uMsg == WM_COMMAND )
+    {
+        const WORD i_id = LOWORD( wParam );
+
+        if( i_id == IDC_HOMEPAGE && HIWORD( wParam ) == STN_CLICKED )
+            ShellExecute( hDlg, "open", X264_WEBSITE, NULL, NULL, SW_SHOWNORMAL );
+        else if( i_id == IDOK || i_id == IDCANCEL )
+            EndDialog( hDlg, i_id );
+        return 1;
+    }
+
+    return 0;
+}
+
+/* adv_update_dlg:
+ *  push config into the advanced dialog: the CABAC and loop filter check
+ *  boxes, the three numeric fields and the fourcc. */
+static void adv_update_dlg( HWND hDlg, CONFIG * config )
+{
+    char psz_fcc[5];
+    UINT i_cabac  = BST_UNCHECKED;
+    UINT i_filter = BST_UNCHECKED;
+
+    if( config->b_cabac )
+        i_cabac = BST_CHECKED;
+    if( config->b_filter )
+        i_filter = BST_CHECKED;
+
+    CheckDlgButton( hDlg, IDC_CABAC, i_cabac );
+    CheckDlgButton( hDlg, IDC_LOOPFILTER, i_filter );
+
+    SetDlgItemInt( hDlg, IDC_IDRFRAMES, config->i_idrframe, FALSE );
+    SetDlgItemInt( hDlg, IDC_IFRAMES, config->i_iframe, FALSE );
+    /* The IDC_KEYFRAME control carries the reference frame count */
+    SetDlgItemInt( hDlg, IDC_KEYFRAME, config->i_refmax, FALSE );
+
+    /* Show exactly four characters, whatever follows them in config->fcc */
+    psz_fcc[4] = '\0';
+    memcpy( psz_fcc, config->fcc, 4 );
+
+    SetDlgItemText( hDlg, IDC_FOURCC, psz_fcc );
+}
+
+
+/* advanced configuration dialog process */
+/* callback_advanced:
+ *  dialog procedure of the advanced configuration dialog. The CONFIG being
+ *  edited arrives as lParam of WM_INITDIALOG and is kept in the window's
+ *  GWL_USERDATA slot; every control change is written to it immediately,
+ *  so closing the dialog (by OK or by cancelling) keeps the edits.
+ *  Returns 1 for WM_INITDIALOG and WM_COMMAND, 0 for any other message. */
+BOOL CALLBACK callback_advanced( HWND hDlg, UINT uMsg, WPARAM wParam, LPARAM lParam )
+{
+    CONFIG* config = (CONFIG*)GetWindowLong(hDlg, GWL_USERDATA);
+
+    switch( uMsg )
+    {
+    case WM_INITDIALOG :
+        SetWindowLong( hDlg, GWL_USERDATA, lParam );
+        config = (CONFIG*)lParam;
+
+        adv_update_dlg( hDlg, config );
+        break;
+
+    case WM_COMMAND:
+        /* HIWORD = notification code, LOWORD = control id */
+        switch ( HIWORD( wParam ) )
+        {
+        case BN_CLICKED :
+            switch( LOWORD( wParam ) )
+            {
+            case IDOK :
+            case IDCANCEL : /* sent for Esc and the close box; without this
+                             * the dialog could only be left through OK */
+                EndDialog( hDlg, LOWORD( wParam ) );
+                break;
+            case IDC_CABAC :
+                config->b_cabac = ( IsDlgButtonChecked( hDlg, IDC_CABAC ) == BST_CHECKED );
+                break;
+            case IDC_LOOPFILTER :
+                config->b_filter = ( IsDlgButtonChecked( hDlg, IDC_LOOPFILTER ) == BST_CHECKED );
+                break;
+            }
+            break;
+        case EN_CHANGE :
+            switch( LOWORD( wParam ) )
+            {
+            case IDC_IDRFRAMES :
+                config->i_idrframe = GetDlgItemInt( hDlg, IDC_IDRFRAMES, FALSE, FALSE );
+                break;
+            case IDC_IFRAMES :
+                config->i_iframe = GetDlgItemInt( hDlg, IDC_IFRAMES, FALSE, FALSE );
+                break;
+            case IDC_KEYFRAME :
+                /* The IDC_KEYFRAME control carries the reference frame count */
+                config->i_refmax = GetDlgItemInt( hDlg, IDC_KEYFRAME, FALSE, FALSE );
+                break;
+            case IDC_FOURCC :
+                /* At most 4 characters plus the terminator */
+                GetDlgItemText( hDlg, IDC_FOURCC, config->fcc, 5 );
+                break;
+            }
+            break;
+        }
+        break;
+    default :
+        return 0;
+    }
+    return 1;
+}
+
diff --git a/vfw/driverproc.c b/vfw/driverproc.c
new file mode 100644
index 00000000..4a050e68
--- /dev/null
+++ b/vfw/driverproc.c
@@ -0,0 +1,234 @@
+/*****************************************************************************
+ * driverproc.c: vfw x264 wrapper
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: driverproc.c,v 1.1 2004/06/03 19:27:09 fenrir Exp $
+ *
+ * Authors: Justin Clay
+ *          Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#include "x264vfw.h"
+
+/* Global dll instance */
+HINSTANCE g_hInst;
+
+
+/* DLL entry point: keeps the module instance handle in g_hInst, which DriverProc passes to DialogBoxParam */
+BOOL WINAPI DllMain( HANDLE hModule, DWORD ul_reason_for_call, LPVOID lpReserved )
+{
+    g_hInst = (HINSTANCE) hModule;
+    return TRUE;
+}
+
+/* Driver entry point: handles the DRV_* and ICM_* messages which vfw programs send out to the codec */
+LRESULT WINAPI DriverProc( DWORD dwDriverId, HDRVR hDriver, UINT uMsg, LPARAM lParam1, LPARAM lParam2 )
+{
+    CODEC *codec = (CODEC *)dwDriverId; /* instance pointer, i.e. the value DRV_OPEN returned */
+
+    switch( uMsg )
+    {
+        case DRV_LOAD:
+        case DRV_FREE:
+            return DRV_OK;
+
+        case DRV_OPEN:
+        {
+            ICOPEN *icopen = (ICOPEN *)lParam2;
+
+            if( icopen != NULL && icopen->fccType != ICTYPE_VIDEO )
+                return DRV_CANCEL;
+
+            if( ( codec = malloc( sizeof( CODEC ) ) ) == NULL )
+            {
+                if( icopen != NULL )
+                    icopen->dwError = ICERR_MEMORY;
+                return 0;
+            }
+
+            memset( codec, 0, sizeof( CODEC ) );
+            config_reg_load( &codec->config );
+            codec->h = NULL;
+
+            if( icopen != NULL )
+                icopen->dwError = ICERR_OK;
+            return (LRESULT)codec;  /* comes back as dwDriverId in the following messages */
+        }
+
+        case DRV_CLOSE:
+            /* From xvid: compress_end/decompress_end don't always get called */
+            compress_end(codec);
+            free( codec );
+            return DRV_OK;
+
+        case DRV_DISABLE:
+        case DRV_ENABLE:
+            return DRV_OK;
+
+        case DRV_INSTALL:
+        case DRV_REMOVE:
+            return DRV_OK;
+
+        case DRV_QUERYCONFIGURE:
+        case DRV_CONFIGURE:
+            return DRV_CANCEL;
+
+        /* info */
+        case ICM_GETINFO:
+        {
+            ICINFO *icinfo = (ICINFO *)lParam1;
+
+            /* return a description */
+            icinfo->fccType      = ICTYPE_VIDEO;
+            icinfo->fccHandler   = FOURCC_X264;
+            icinfo->dwFlags      = VIDCF_COMPRESSFRAMES | VIDCF_FASTTEMPORALC;
+
+            icinfo->dwVersion    = 0;
+            icinfo->dwVersionICM = ICVERSION;
+
+            wcscpy( icinfo->szName, X264_NAME_L);
+            wcscpy( icinfo->szDescription, X264_DESC_L);
+
+            return lParam2; /* size of struct */
+        }
+
+        case ICM_ABOUT:
+            if( lParam1 != -1 )     /* -1: nothing is shown, only ICERR_OK is returned */
+            {
+                DialogBoxParam(g_hInst, MAKEINTRESOURCE(IDD_ABOUT), (HWND)lParam1, callback_about, 0 );
+            }
+            return ICERR_OK;
+
+        case ICM_CONFIGURE:
+            if( lParam1 != -1 )     /* -1: nothing is shown, only ICERR_OK is returned */
+            {
+                CONFIG temp;        /* the dialog works on a copy of the configuration */
+
+                codec->config.b_save = FALSE;
+			    memcpy( &temp, &codec->config, sizeof(CONFIG) );
+
+                DialogBoxParam( g_hInst, MAKEINTRESOURCE(IDD_MAINCONFIG), (HWND)lParam1, callback_main, (LPARAM)&temp );
+
+			    if( temp.b_save )
+			    {
+				    memcpy( &codec->config, &temp, sizeof(CONFIG) );
+                    config_reg_save( &codec->config );
+			    }
+            }
+            return ICERR_OK;
+
+        case ICM_GETSTATE:
+            if( (void*)lParam1 == NULL )    /* no buffer: report the size of the state */
+            {
+                return sizeof( CONFIG );
+            }
+            memcpy( (void*)lParam1, &codec->config, sizeof( CONFIG ) );
+            return ICERR_OK;
+
+        case ICM_SETSTATE:
+            if( (void*)lParam1 == NULL )    /* no state given: reload through config_reg_load() */
+            {
+                config_reg_load( &codec->config );
+                return 0;
+            }
+            memcpy( &codec->config, (void*)lParam1, sizeof( CONFIG ) );
+            return 0;
+
+        /* accepted but ignored (not sure the difference, private/public data?) */
+        case ICM_GET:
+        case ICM_SET:
+            return ICERR_OK;
+
+
+        /* older-style config: not supported */
+        case ICM_GETDEFAULTQUALITY:
+        case ICM_GETQUALITY:
+        case ICM_SETQUALITY:
+        case ICM_GETBUFFERSWANTED:
+        case ICM_GETDEFAULTKEYFRAMERATE:
+            return ICERR_UNSUPPORTED;
+
+
+        /* compressor */
+        case ICM_COMPRESS_QUERY:
+            return compress_query(codec, (BITMAPINFO *)lParam1, (BITMAPINFO *)lParam2);
+
+        case ICM_COMPRESS_GET_FORMAT:
+            return compress_get_format(codec, (BITMAPINFO *)lParam1, (BITMAPINFO *)lParam2);
+
+        case ICM_COMPRESS_GET_SIZE:
+            return compress_get_size(codec, (BITMAPINFO *)lParam1, (BITMAPINFO *)lParam2);
+
+        case ICM_COMPRESS_FRAMES_INFO:
+            return compress_frames_info(codec, (ICCOMPRESSFRAMES *)lParam1);
+
+        case ICM_COMPRESS_BEGIN:
+            return compress_begin(codec, (BITMAPINFO *)lParam1, (BITMAPINFO *)lParam2);
+
+        case ICM_COMPRESS_END:
+            return compress_end(codec);
+
+        case ICM_COMPRESS:
+            return compress(codec, (ICCOMPRESS *)lParam1);
+
+        /* decompressor : not implemented */
+        case ICM_DECOMPRESS_QUERY:
+        case ICM_DECOMPRESS_GET_FORMAT:
+        case ICM_DECOMPRESS_BEGIN:
+        case ICM_DECOMPRESS_END:
+        case ICM_DECOMPRESS:
+        case ICM_DECOMPRESS_GET_PALETTE:
+        case ICM_DECOMPRESS_SET_PALETTE:
+        case ICM_DECOMPRESSEX_QUERY:
+        case ICM_DECOMPRESSEX_BEGIN:
+        case ICM_DECOMPRESSEX_END:
+        case ICM_DECOMPRESSEX:
+            return ICERR_UNSUPPORTED;
+
+#if 0
+        /* VFWEXT entry point : XXX what's that ? */
+        case ICM_USER+0x0fff :
+            if (lParam1 == VFWEXT_CONFIGURE_INFO) {
+                VFWEXT_CONFIGURE_INFO_T * info = (VFWEXT_CONFIGURE_INFO_T*)lParam2;
+                DPRINTF("%i %i %i %i %i %i",
+                    info->ciWidth, info->ciHeight,
+                    info->ciRate, info->ciScale,
+                    info->ciActiveFrame, info->ciFrameCount);
+
+                codec->config.ci_valid = 1;
+                memcpy(&codec->config.ci, (void*)lParam2, sizeof(VFWEXT_CONFIGURE_INFO_T));
+                return ICERR_OK;
+            }
+            return ICERR_UNSUPPORTED;
+#endif
+
+        default:
+            return DefDriverProc( dwDriverId, hDriver, uMsg, lParam1, lParam2 );
+    }
+}
+
+void WINAPI Configure(HWND hwnd, HINSTANCE hinst, LPTSTR lpCmdLine, int nCmdShow)
+{
+    /* open a codec instance, run its configuration dialog over the desktop window, close it again */
+    DWORD dwInstance = DriverProc(0, 0, DRV_OPEN, 0, 0);
+
+    if (dwInstance == (DWORD)NULL)
+        return;
+
+    DriverProc(dwInstance, 0, ICM_CONFIGURE, (LPARAM)GetDesktopWindow(), 0);
+    DriverProc(dwInstance, 0, DRV_CLOSE, 0, 0);
+}
diff --git a/vfw/resource.h b/vfw/resource.h
new file mode 100644
index 00000000..df693805
--- /dev/null
+++ b/vfw/resource.h
@@ -0,0 +1,52 @@
+//{{NO_DEPENDENCIES}}
+// Microsoft Developer Studio generated include file.
+// Used by resource.rc
+// NOTE(review): some values are shared (101, 1004, 1005, 1012, 1013, 1014, 1022, 1024); presumably aliases or controls of different dialogs - confirm before reusing an ID
+#define IDD_DIALOG1                     101
+#define IDD_MAINCONFIG                  101
+#define IDD_ADVANCED                    102
+#define IDD_ABOUT                       103
+#define IDC_BITRATESLIDER               1002
+#define IDC_BITRATEEDIT                 1003
+#define IDC_BITRATESLIDER2              1004
+#define IDC_QUANTSLIDER                 1004
+#define IDC_CABAC                       1005
+#define IDC_BITRATEEDIT2                1005
+#define IDC_QUANTEDIT                   1005
+#define IDC_LOOPFILTER                  1007
+#define IDC_BITRATELOW                  1009
+#define IDC_BITRATELOW2                 1010
+#define IDC_BITRATEHIGH                 1011
+#define IDC_BFRAMES                     1012
+#define IDC_BITRATEHIGH2                1012
+#define IDC_IDRFRAMES                   1012
+#define IDC_BFRAMES2                    1013
+#define IDC_IFRAMES                     1013
+#define IDC_EDIT3                       1014
+#define IDC_KEYFRAME                    1014
+#define IDC_DEFAULTS                    1016
+#define IDC_CHECK3                      1017
+#define IDC_ADVANCED                    1018
+#define IDC_RADIO1                      1022
+#define IDC_RADIOBITRATE                1022
+#define IDC_MODE                        1023
+#define IDC_RADIOQUALITY                1024
+#define IDC_RADIOQUANT                  1024
+#define IDC_RADIOTWOPASS                1026
+#define IDC_USEADVANCED                 1029
+#define IDC_ADVDEFAULTS                 1030
+#define IDC_HOMEPAGE                    1034
+#define IDC_X264                        1035
+#define IDC_BUILD                       1036
+#define IDC_FOURCC                      1039
+
+// Next default values for new objects
+// 
+#ifdef APSTUDIO_INVOKED
+#ifndef APSTUDIO_READONLY_SYMBOLS
+#define _APS_NEXT_RESOURCE_VALUE        104
+#define _APS_NEXT_COMMAND_VALUE         40001
+#define _APS_NEXT_CONTROL_VALUE         1040
+#define _APS_NEXT_SYMED_VALUE           101
+#endif
+#endif
diff --git a/vfw/x264vfw.h b/vfw/x264vfw.h
new file mode 100644
index 00000000..36e36cf3
--- /dev/null
+++ b/vfw/x264vfw.h
@@ -0,0 +1,103 @@
+#ifndef _X264_VFW_H
+#define _X264_VFW_H
+
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+
+#include <windows.h>
+#include <vfw.h>
+
+#include <x264.h>
+
+#include "resource.h"
+
+/* Name */
+#define X264_NAME_L     L"x264"
+#define X264_DESC_L     L"x264 - H264/AVC encoder"
+
+/* Codec fcc */
+#define FOURCC_X264 mmioFOURCC('X','2','6','4')
+
+/* yuv 4:2:0 planar */
+#define FOURCC_I420 mmioFOURCC('I','4','2','0')
+#define FOURCC_IYUV mmioFOURCC('I','Y','U','V')
+#define FOURCC_YV12 mmioFOURCC('Y','V','1','2')
+
+/* yuv 4:2:2 packed */
+#define FOURCC_YUY2 mmioFOURCC('Y','U','Y','2')
+#define FOURCC_YUYV mmioFOURCC('Y','U','Y','V')
+
+#define X264_WEBSITE	"http://lyra.via.ecp.fr/"
+
+/* CONFIG: vfw config, copied as a raw sizeof(CONFIG) block by ICM_GETSTATE/ICM_SETSTATE
+ */
+typedef struct
+{
+    /********** ATTENTION **********/
+    int mode;                   /* Vidomi directly accesses these vars */
+    int bitrate;
+    int desired_size;           /* please try to avoid modifications here */
+    char stats[MAX_PATH];
+    /*******************************/
+
+    /* Our config */
+    int i_refmax;               /* set from the IDC_KEYFRAME edit box */
+    int i_idrframe;             /* set from the IDC_IDRFRAMES edit box */
+    int i_iframe;               /* set from the IDC_IFRAMES edit box */
+
+    int i_qp;
+    int b_filter;               /* state of the IDC_LOOPFILTER check box */
+
+    int b_cabac;                /* state of the IDC_CABAC check box */
+
+    int b_i4x4;                 /* presumably mirror the X264_ANALYSE_* flags - TODO confirm */
+    int b_psub16x16;
+    int b_psub8x8;
+
+    /* vfw interface */
+    int b_save;                 /* cleared before the config dialog; the result is kept and saved only if it comes back set */
+    /* fourcc used */
+    char fcc[4+1];              /* 4 characters + '\0', edited through IDC_FOURCC */
+    int  i_encoding_type;
+} CONFIG;
+
+/* CODEC: vfw codec instance, allocated by DRV_OPEN and freed by DRV_CLOSE
+ */
+typedef struct
+{
+    CONFIG config;              /* filled by config_reg_load() when the instance is opened */
+
+    /* handle */
+    x264_t *h;                  /* NULL right after DRV_OPEN */
+
+    /* XXX: needed ? */
+    unsigned int fincr;
+    unsigned int fbase;
+} CODEC;
+
+/* Compress functions: DriverProc forwards each ICM_COMPRESS_* message to one of them */
+LRESULT compress_query(CODEC *, BITMAPINFO *, BITMAPINFO *);
+LRESULT compress_get_format(CODEC *, BITMAPINFO *, BITMAPINFO *);
+LRESULT compress_get_size(CODEC *, BITMAPINFO *, BITMAPINFO *);
+LRESULT compress_frames_info(CODEC *, ICCOMPRESSFRAMES *);
+LRESULT compress_begin(CODEC *, BITMAPINFO *, BITMAPINFO *);
+LRESULT compress_end(CODEC *);      /* also called on DRV_CLOSE */
+LRESULT compress(CODEC *, ICCOMPRESS *);
+
+
+/* config functions: load/save the persistent settings (presumably in the registry, going by the names - confirm) */
+void config_reg_load( CONFIG * config );
+void config_reg_save( CONFIG * config );
+
+
+/* Dialog callbacks: dialog procedures; DriverProc hands callback_about and callback_main to DialogBoxParam */
+BOOL CALLBACK callback_about( HWND hDlg, UINT uMsg, WPARAM wParam, LPARAM lParam );
+BOOL CALLBACK callback_main ( HWND hDlg, UINT uMsg, WPARAM wParam, LPARAM lParam );
+BOOL CALLBACK callback_advanced( HWND hDlg, UINT uMsg, WPARAM wParam, LPARAM lParam );
+
+/* Dll instance */
+extern HINSTANCE g_hInst;
+
+#endif
+
diff --git a/x264.c b/x264.c
new file mode 100644
index 00000000..75c347b6
--- /dev/null
+++ b/x264.c
@@ -0,0 +1,558 @@
+/*****************************************************************************
+ * x264: h264 encoder/decoder testing program.
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: x264.c,v 1.1 2004/06/03 19:24:12 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdint.h>
+
+#include <math.h>
+
+#include <signal.h>
+#define _GNU_SOURCE
+#include <getopt.h>
+
+#ifdef _MSC_VER
+#include <io.h>     /* _setmode() */
+#include <fcntl.h>  /* _O_BINARY */
+#endif
+
+#include "x264.h"
+#include "core/common.h"
+
+#define DATA_MAX 3000000
+uint8_t data[DATA_MAX];
+
+/* Ctrl-C handler: only raises i_ctrl_c, which the encoding loop tests before each frame */
+static volatile sig_atomic_t i_ctrl_c = 0;  /* the one object type a signal handler may portably write */
+static void    SigIntHandler( int a )
+{
+    i_ctrl_c = 1;
+}
+
+static void Help( void );
+static int  Parse( int argc, char **argv, x264_param_t  *param, FILE **p_fin, FILE **p_fout, int *pb_decompress );
+static int  Encode( x264_param_t  *param, FILE *fyuv,  FILE *fout );
+static int  Decode( x264_param_t  *param, FILE *fh26l, FILE *fout );
+
+/****************************************************************************
+ * main: command line front end.
+ *  Takes the library defaults, lets Parse() apply the command line and open
+ *  the streams, then runs Decode() or Encode() and returns its status
+ *  (-1 when Parse() rejects the command line).
+ ****************************************************************************/
+int main( int argc, char **argv )
+{
+    x264_param_t param;
+    FILE    *fin;
+    FILE    *fout;
+    int     b_decompress;
+
+#ifdef _MSC_VER
+    /* stdin/stdout may carry the streams: switch both to binary mode */
+    _setmode(_fileno(stdin), _O_BINARY);    /* thanks to Marcos Morais <morais at dee.ufcg.edu.br> */
+    _setmode(_fileno(stdout), _O_BINARY);
+#endif
+
+    /* library defaults, at 25 frames per second */
+    x264_param_default( &param );
+    param.f_fps = 25.0;
+
+    /* Parse command line */
+    if( Parse( argc, argv, &param, &fin, &fout, &b_decompress ) < 0 )
+        return -1;
+
+    /* Control-C handler: SigIntHandler only raises the i_ctrl_c flag */
+    signal( SIGINT, SigIntHandler );
+
+    /* run the requested direction, its result is the exit status */
+    if( !b_decompress )
+    {
+        return Encode( &param, fin, fout );
+    }
+    return Decode( &param, fin, fout );
+}
+
+/*****************************************************************************
+ * Help: print the build number and the command line syntax on stderr.
+ *****************************************************************************/
+static void Help( void )
+{
+    fprintf( stderr,
+             "x264 build:0x%4.4x\n"
+             "Syntax: x264 [options] [-o out.h26l] in.yuv widthxheight\n"
+             "\n"
+             "  -h, --help                  Print this help\n"
+             "\n"
+             "  -I, --idrframe <integer>    Each 'number' I frames are IDR frames\n"
+             "  -i, --iframe <integer>      Frequency of I frames\n"
+             "  -b, --bframe <integer>      Number of B-frames between I and P\n"
+             "\n"
+             "  -c, --cabac                 Enable CABAC\n"
+             "  -r, --ref <integer>         Number of references\n"
+             "  -n, --nf                    Disable loop filter\n"
+             "  -f, --filter <alpha:beta>   Loop filter AlphaC0 and Beta parameters\n"
+             "  -q, --qp <integer>          Set QP\n"
+             "  -B, --bitrate <integer>     Set bitrate [broken]\n"
+             "\n"
+             "  -A, --analyse <string>      Analyse options:\n"
+             "                                  - i4x4\n"
+             "                                  - psub16x16,psub8x8\n"
+             "                                  - none, all\n"
+             "\n"
+             "  -s, --sar width:height      Specify Sample Aspect Ratio\n"
+             "  -o, --output                Specify output file\n"
+             "\n"
+             "      --no-asm                Disable any CPU optims\n"
+             "\n",
+            X264_BUILD
+           );
+}
+
+/*****************************************************************************
+ * Parse: apply the command line to param and open the input/output streams.
+ *  Returns 0 on success, -1 on error or once the help has been printed.
+ *****************************************************************************/
+static int  Parse( int argc, char **argv,
+                   x264_param_t  *param,
+                   FILE **p_fin, FILE **p_fout, int *pb_decompress )
+{
+    char *psz_filename = NULL;
+
+    /* Default output */
+    *p_fout = stdout;
+    *p_fin  = stdin;
+    *pb_decompress = 0;
+
+    /* Parse command line options */
+    opterr = 0; // no error message
+    for( ;; )
+    {
+        int long_options_index;
+        static struct option long_options[] =
+        {
+            { "help",    no_argument,       NULL, 'h' },
+            { "bitrate", required_argument, NULL, 'B' },
+            { "bframe",  required_argument, NULL, 'b' },
+            { "iframe",  required_argument, NULL, 'i' },
+            { "idrframe",required_argument, NULL, 'I' },
+            { "nf",      no_argument,       NULL, 'n' },
+            { "filter",  required_argument, NULL, 'f' },
+            { "cabac",   no_argument,       NULL, 'c' },
+            { "qp",      required_argument, NULL, 'q' },
+            { "ref",     required_argument, NULL, 'r' },
+            { "no-asm",  no_argument,       NULL, 'C' },
+            { "sar",     required_argument, NULL, 's' },
+            { "output",  required_argument, NULL, 'o' },
+            { "analyse", required_argument, NULL, 'A' },
+            {0, 0, 0, 0}
+        };
+
+        int c;
+
+        /* every short option that Help() lists must appear here, -f included */
+        c = getopt_long( argc, argv, "hi:I:b:r:cxB:q:nf:o:s:A:",
+                         long_options, &long_options_index);
+
+        if( c == -1 )
+            break;
+
+        switch( c )
+        {
+            case 'h':
+                Help();
+                return -1;
+
+            case 0:
+                break;
+            case 'B':
+                param->i_bitrate = atol( optarg );
+                break;
+            case 'b':
+                param->i_bframe = atol( optarg );
+                break;
+            case 'i':
+                param->i_iframe = atol( optarg );
+                break;
+            case 'I':
+                param->i_idrframe = atol( optarg );
+                break;
+            case 'n':
+                param->b_deblocking_filter = 0;
+                break;
+            case 'f':
+            {
+                char *p = strchr( optarg, ':' );
+                if( p )
+                {
+                    param->i_deblocking_filter_alphac0 = atoi( optarg );
+                    param->i_deblocking_filter_beta = atoi( p + 1 );    /* beta starts after the ':' */
+                }
+                break;
+            }
+            case 'q':
+                param->i_qp_constant = atoi( optarg );
+                break;
+            case 'r':
+                param->i_frame_reference = atoi( optarg );
+                break;
+            case 'c':
+                param->b_cabac = 1;
+                break;
+            case 'x':
+                *pb_decompress = 1;
+                break;
+            case 'C':
+                param->cpu = 0;
+                break;
+            case 'o':
+                if( ( *p_fout = fopen( optarg, "wb" ) ) == NULL )
+                {
+                    fprintf( stderr, "cannot open output file `%s'\n", optarg );
+                    return -1;
+                }
+                break;
+            case 's':
+            {
+                char *p = strchr( optarg, ':' );
+                if( p )
+                {
+                    param->vui.i_sar_width = atoi( optarg );
+                    param->vui.i_sar_height = atoi( p + 1 );
+                }
+                break;
+            }
+            case 'A':
+                param->analyse.inter = 0;
+                if( strstr( optarg, "none" ) )  param->analyse.inter = 0x000000;
+                if( strstr( optarg, "all" ) )   param->analyse.inter = X264_ANALYSE_I4x4|X264_ANALYSE_PSUB16x16|X264_ANALYSE_PSUB8x8;
+
+                if( strstr( optarg, "i4x4" ) )      param->analyse.inter |= X264_ANALYSE_I4x4;
+                if( strstr( optarg, "psub16x16" ) ) param->analyse.inter |= X264_ANALYSE_PSUB16x16;
+                if( strstr( optarg, "psub8x8" ) )   param->analyse.inter |= X264_ANALYSE_PSUB8x8;
+                break;
+
+            default:
+                fprintf( stderr, "unknown option (%c)\n", optopt );
+                return -1;
+        }
+    }
+
+    /* Get the file name */
+    if( optind > argc - 1 )
+    {
+        Help();
+        return -1;
+    }
+    psz_filename = argv[optind++];
+
+    if( !(*pb_decompress) )
+    {
+        char *psz_size = NULL;
+        char *p;
+        if( optind > argc - 1 )
+        {
+            char *psz = psz_filename;
+            char *x = NULL;
+            /* no size argument: look for the first "<digits>x<digit>" in the file name */
+            while( *psz && psz_size == NULL )
+            {
+                while( *psz && ( *psz < '0' || *psz > '9' ) ) psz++;
+                x = psz;
+                while( *x >= '0' && *x <= '9' ) x++;
+                if( x > psz && x[0] == 'x' && x[1] >= '0' && x[1] <= '9' )
+                    psz_size = psz;
+                else
+                    psz = x;    /* always move forward so that the scan terminates */
+            }
+            if( psz_size == NULL )
+            {
+                Help();
+                return -1;
+            }
+            fprintf( stderr, "x264: file name gives %dx%d\n", atoi(psz_size), atoi(x+1) );
+        }
+        else
+            psz_size = argv[optind++];
+
+        /* <width><separator><height>: do not step over a missing separator */
+        param->i_width  = strtol( psz_size, &p, 0 );
+        param->i_height = *p != '\0' ? strtol( p+1, &p, 0 ) : 0;
+        if( param->i_width <= 0 || param->i_height <= 0 )
+        {
+            fprintf( stderr, "invalid picture size `%s'\n", psz_size );
+            return -1;
+        }
+    }
+
+    /* open the input */
+    if( !strcmp( psz_filename, "-" ) )
+    {
+        *p_fin = stdin;
+        optind++;
+    }
+    else if( ( *p_fin = fopen( psz_filename, "rb" ) ) == NULL )
+    {
+        fprintf( stderr, "could not open input file '%s'\n", psz_filename );
+        return -1;
+    }
+
+    return 0;
+}
+
+/*****************************************************************************
+ * Decode: decoder front end, currently disabled: prints a message, returns -1.
+ *****************************************************************************/
+static int  Decode( x264_param_t  *param, FILE *fh26l, FILE *fout )
+{
+    fprintf( stderr, "decompressor not working (help is welcome)\n" );
+    return -1;
+#if 0 /* decoding loop, compiled out */
+    x264_nal_t nal;
+    int i_data;
+    int b_eof;
+
+    //param.cpu = 0;
+    if( ( h = x264_decoder_open( &param ) ) == NULL )
+    {
+        fprintf( stderr, "x264_decoder_open failed\n" );
+        return -1;
+    }
+
+    i_start = x264_mdate();
+    b_eof = 0;
+    i_frame = 0;
+    i_data  = 0;
+    nal.p_payload = malloc( DATA_MAX );
+
+    while( !i_ctrl_c )
+    {
+        uint8_t *p, *p_next, *end;
+        int i_size;
+        /* fill buffer */
+        if( i_data < DATA_MAX && !b_eof )
+        {
+            int i_read = fread( &data[i_data], 1, DATA_MAX - i_data, fh26l );
+            if( i_read <= 0 )
+            {
+                b_eof = 1;
+            }
+            else
+            {
+                i_data += i_read;
+            }
+        }
+
+        if( i_data < 3 )
+        {
+            break;
+        }
+
+        end = &data[i_data];
+
+        /* extract one nal */
+        p = &data[0];
+        while( p < end - 3 )
+        {
+            if( p[0] == 0x00 && p[1] == 0x00 && p[2] == 0x01 )
+            {
+                break;
+            }
+            p++;
+        }
+
+        if( p >= end - 3 )
+        {
+            fprintf( stderr, "garbage (i_data = %d)\n", i_data );
+            i_data = 0;
+            continue;
+        }
+
+        p_next = p + 3;
+        while( p_next < end - 3 )
+        {
+            if( p_next[0] == 0x00 && p_next[1] == 0x00 && p_next[2] == 0x01 )
+            {
+                break;
+            }
+            p_next++;
+        }
+
+        if( p_next == end - 3 && i_data < DATA_MAX )
+        {
+            p_next = end;
+        }
+
+        /* decode this nal */
+        i_size = p_next - p - 3;
+        if( i_size <= 0 )
+        {
+            if( b_eof )
+            {
+                break;
+            }
+            fprintf( stderr, "nal too large (FIXME) ?\n" );
+            i_data = 0;
+            continue;
+        }
+
+        x264_nal_decode( &nal, p +3, i_size );
+
+        /* decode the content of the nal */
+        x264_decoder_decode( h, &pic, &nal );
+
+        if( pic != NULL )
+        {
+            int i;
+
+            i_frame++;
+
+            for( i = 0; i < pic->i_plane;i++ )
+            {
+                int i_line;
+                int i_div;
+
+                i_div = i==0 ? 1 : 2;
+                for( i_line = 0; i_line < pic->i_height/i_div; i_line++ )
+                {
+                    fwrite( pic->plane[i]+i_line*pic->i_stride[i], 1, pic->i_width/i_div, fout );
+                }
+            }
+        }
+
+        memmove( &data[0], p_next, end - p_next );
+        i_data -= p_next - &data[0];
+    }
+
+    i_end = x264_mdate();
+    free( nal.p_payload );
+    fprintf( stderr, "\n" );
+
+    x264_decoder_close( h );
+
+    fclose( fh26l );
+    if( fout != stdout )
+    {
+        fclose( fout );
+    }
+    if( i_frame > 0 )
+    {
+        double fps = (double)i_frame * (double)1000000 /
+                     (double)( i_end - i_start );
+        fprintf( stderr, "decoded %d frames %ffps\n", i_frame, fps );
+    }
+#endif
+}
+
+/*****************************************************************************
+ * Encode: encode the raw I420 frames of fyuv into fout until end of input or Ctrl-C,
+ *  then close both (not stdout). Returns 0, or -1 on bad size or encoder failure.
+ *****************************************************************************/
+static int  Encode( x264_param_t  *param, FILE *fyuv, FILE *fout )
+{
+    x264_t *h;
+    x264_picture_t pic;
+
+    int     i_frame, i_ret = 0;
+    int64_t i_start, i_end, i_file;
+    size_t  i_luma, i_chroma;       /* bytes per plane of one frame */
+
+    /* with an empty frame every fread() below would "succeed" forever */
+    if( param->i_width <= 0 || param->i_height <= 0 )
+    {
+        fprintf( stderr, "invalid picture size %dx%d\n", (int)param->i_width, (int)param->i_height );
+        return -1;
+    }
+    i_luma   = (size_t)param->i_width * (size_t)param->i_height;
+    i_chroma = i_luma / 4;
+
+    if( ( h = x264_encoder_open( param ) ) == NULL )
+    {
+        fprintf( stderr, "x264_encoder_open failed\n" );
+        return -1;
+    }
+
+    /* Create a new pic */
+    x264_picture_alloc( &pic, X264_CSP_I420, param->i_width, param->i_height );
+
+    i_start = x264_mdate();
+    for( i_frame = 0, i_file = 0; i_ctrl_c == 0 ; i_frame++ )
+    {
+        int         i_nal;
+        x264_nal_t  *nal;
+        int         i;
+
+        /* read a frame; an incomplete one ends the stream instead of being encoded */
+        if( fread( pic.img.plane[0], 1, i_luma,   fyuv ) != i_luma   ||
+            fread( pic.img.plane[1], 1, i_chroma, fyuv ) != i_chroma ||
+            fread( pic.img.plane[2], 1, i_chroma, fyuv ) != i_chroma )
+            break;
+
+        /* Do not force any parameters */
+        pic.i_type = X264_TYPE_AUTO;
+        if( x264_encoder_encode( h, &nal, &i_nal, &pic ) < 0 )
+        {
+            /* nal and i_nal cannot be trusted after a failure: stop */
+            fprintf( stderr, "x264_encoder_encode failed\n" );
+            i_ret = -1;
+            break;
+        }
+
+        for( i = 0; i < i_nal; i++ )
+        {
+            int i_size;
+            int i_data;
+
+            i_data = DATA_MAX;
+            if( ( i_size = x264_nal_encode( data, &i_data, 1, &nal[i] ) ) > 0 )
+            {
+                i_file += fwrite( data, 1, i_size, fout );
+            }
+            else if( i_size < 0 )
+            {
+                fprintf( stderr,
+                         "need to increase buffer size (size=%d)\n", -i_size );
+            }
+        }
+    }
+    i_end = x264_mdate();
+    x264_picture_clean( &pic );
+    x264_encoder_close( h );
+    fprintf( stderr, "\n" );
+
+    fclose( fyuv );
+    if( fout != stdout )
+        fclose( fout );
+
+    if( i_frame > 0 )
+    {
+        double fps = (double)i_frame * (double)1000000 /
+                     (double)( i_end - i_start );
+        /* int64_t is not necessarily long long: cast to match %lld; the bitrate assumes 25 fps */
+        fprintf( stderr, "encoded %d frames %ffps %lld kb/s\n", i_frame, fps, (long long)( i_file * 8 * 25 / i_frame / 1000 ) );
+    }
+
+    return i_ret;
+}
+
+
diff --git a/x264.h b/x264.h
new file mode 100644
index 00000000..772c6c7f
--- /dev/null
+++ b/x264.h
@@ -0,0 +1,249 @@
+/*****************************************************************************
+ * x264.h: h264 encoder library
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: x264.h,v 1.1 2004/06/03 19:24:12 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#ifndef _X264_H
+#define _X264_H 1
+
+#define X264_BUILD 0x0008  /* NOTE(review): presumably the library/API build number -- confirm */
+
+/* x264_t:
+ *      opaque handle used by the encoder (and the disabled decoder) functions */
+typedef struct x264_t x264_t;
+
+/****************************************************************************
+ * Initialisation structure and function.
+ ****************************************************************************/
+/* CPU flags: one distinct bit each, so several can be OR-ed together
+ * (see the cpu field of x264_param_t) */
+#define X264_CPU_MMX        0x000001    /* mmx */
+#define X264_CPU_MMXEXT     0x000002    /* mmx-ext*/
+#define X264_CPU_SSE        0x000004    /* sse */
+#define X264_CPU_SSE2       0x000008    /* sse 2 */
+#define X264_CPU_3DNOW      0x000010    /* 3dnow! */
+#define X264_CPU_3DNOWEXT   0x000020    /* 3dnow! ext */
+#define X264_CPU_ALTIVEC    0x000040    /* altivec */
+
+/* Analyse flags: bit masks, presumably for the analyse.intra / analyse.inter
+ * fields of x264_param_t -- TODO confirm which field accepts which flag */
+#define X264_ANALYSE_I4x4       0x0001  /* Analyse i4x4 */
+#define X264_ANALYSE_PSUB16x16  0x0010  /* Analyse p16x8, p8x16 and p8x8 */
+#define X264_ANALYSE_PSUB8x8    0x0020  /* Analyse p8x4, p4x8, p4x4 */
+
+/* Colorspace type: the ids 0x0000-0x0008 below all fit inside X264_CSP_MASK;
+ * X264_CSP_VFLIP is a separate bit outside that mask */
+#define X264_CSP_MASK           0x00ff  /* mask covering the colorspace ids */
+#define X264_CSP_NONE           0x0000  /* Invalid mode     */
+#define X264_CSP_I420           0x0001  /* yuv 4:2:0 planar */
+#define X264_CSP_I422           0x0002  /* yuv 4:2:2 planar */
+#define X264_CSP_I444           0x0003  /* yuv 4:4:4 planar */
+#define X264_CSP_YV12           0x0004  /* yuv 4:2:0 planar */
+#define X264_CSP_YUYV           0x0005  /* yuv 4:2:2 packed */
+#define X264_CSP_RGB            0x0006  /* rgb 24bits       */
+#define X264_CSP_BGR            0x0007  /* bgr 24bits       */
+#define X264_CSP_BGRA           0x0008  /* bgr 32bits       */
+#define X264_CSP_VFLIP          0x1000  /* flag bit; presumably "image is vertically flipped" -- TODO confirm */
+
+/* Slice type: values for the i_type field of x264_picture_t
+ */
+#define X264_TYPE_AUTO          0x0000  /* Let x264 choose the right type */
+#define X264_TYPE_IDR           0x0001
+#define X264_TYPE_I             0x0002
+#define X264_TYPE_P             0x0003
+#define X264_TYPE_B             0x0004
+
+typedef struct
+{
+    /* CPU flags (X264_CPU_* bits) */
+    unsigned int cpu;
+
+    /* Video Properties */
+    int         i_width;
+    int         i_height;
+    int         i_csp;  /* CSP of the encoded bitstream, only i420 is supported */
+
+    struct
+    {
+        /* reduced so that 0 < x <= 65535 and prime (NOTE(review): presumably coprime -- confirm) */
+        int         i_sar_height;
+        int         i_sar_width;
+    } vui;
+
+    float       f_fps;  /* Used for rate control only */
+
+    /* Bitstream parameters */
+    int         i_frame_reference;  /* Maximum number of reference frames */
+    int         i_idrframe; /* every i_idrframe-th I frame is marked as IDR */
+    int         i_iframe;   /* every i_iframe-th frame is intra */
+    int         i_bframe;   /* how many b-frames between 2 reference pictures */
+
+    int         b_deblocking_filter;
+    int         i_deblocking_filter_alphac0;    /* [-6, 6] -6 light filter, 6 strong */
+    int         i_deblocking_filter_beta;       /* [-6, 6]  same meaning as alphac0 */
+
+    int         b_cabac;
+    int         i_cabac_init_idc;
+
+    int         i_qp_constant;  /* 1-51 */
+    int         i_bitrate;      /* not working yet */
+
+    /* Encoder analyser parameters */
+    struct
+    {
+        unsigned int intra;     /* intra flags */
+        unsigned int inter;     /* inter flags */
+    } analyse;
+
+} x264_param_t;
+
+/* x264_param_default:
+ *      fill the given x264_param_t with default values and perform CPU detection */
+void    x264_param_default( x264_param_t * );
+
+/****************************************************************************
+ * Picture structures and functions.
+ ****************************************************************************/
+typedef struct
+{
+    int     i_csp;          /* presumably one of the X264_CSP_* values -- TODO confirm */
+
+    int     i_plane;        /* presumably the number of planes in use -- TODO confirm */
+    int     i_stride[4];    /* one stride per plane (NOTE(review): unit not shown here) */
+    uint8_t *plane[4];      /* one data pointer per plane; uint8_t must be declared by the includer */
+} x264_image_t;
+
+typedef struct
+{
+    /* In: force picture type, X264_TYPE_* (if not auto) XXX: ignored for now
+     * Out: type of the encoded picture */
+    int     i_type;
+    /* In: force the quantizer when > 0 */
+    int     i_qpplus1;
+    /* In: user pts, Out: pts of the encoded picture (user) */
+    int64_t i_pts;
+
+    /* In: raw image data */
+    x264_image_t img;
+} x264_picture_t;
+
+/* x264_picture_alloc:
+ *  allocate data for a picture. You must call x264_picture_clean on it afterwards. */
+void x264_picture_alloc( x264_picture_t *pic, int i_csp, int i_width, int i_height );
+
+/* x264_picture_clean:
+ *  free the resources associated with an x264_picture_t; use it ONLY on
+ *  pictures allocated with x264_picture_alloc */
+void x264_picture_clean( x264_picture_t *pic );
+
+/****************************************************************************
+ * NAL structure and functions:
+ ****************************************************************************/
+/* NAL unit types (stored in the i_type field of x264_nal_t) */
+enum nal_unit_type_e
+{
+    NAL_UNKNOWN = 0,
+    NAL_SLICE   = 1,
+    NAL_SLICE_DPA   = 2,
+    NAL_SLICE_DPB   = 3,
+    NAL_SLICE_DPC   = 4,
+    NAL_SLICE_IDR   = 5,    /* ref_idc != 0 */
+    NAL_SEI         = 6,    /* ref_idc == 0 */
+    NAL_SPS         = 7,
+    NAL_PPS         = 8
+    /* ref_idc == 0 for 6,9,10,11,12 (types 9-12 are not declared in this enum) */
+};
+enum nal_priority_e     /* values for the i_ref_idc field of x264_nal_t */
+{
+    NAL_PRIORITY_DISPOSABLE = 0,
+    NAL_PRIORITY_LOW        = 1,
+    NAL_PRIORITY_HIGH       = 2,
+    NAL_PRIORITY_HIGHEST    = 3,    /* NOTE(review): trailing comma is valid C99 but not C89 */
+};
+
+typedef struct
+{
+    int i_ref_idc;  /* nal_priority_e */
+    int i_type;     /* nal_unit_type_e */
+
+    /* Raw payload (i_payload is presumably its size in bytes -- TODO confirm) */
+    int     i_payload;
+    uint8_t *p_payload;
+} x264_nal_t;
+
+/* x264_nal_encode:
+ *      encode a nal into a buffer, setting the size.
+ *      if b_annexeb then a long sync word is added
+ *      XXX: it currently doesn't check for overflow (NOTE(review): the encode loop earlier in this patch treats a return < 0 as "buffer too small" -- confirm) */
+int x264_nal_encode( void *, int *, int b_annexeb, x264_nal_t *nal );
+
+/* x264_nal_decode:
+ *      decode a nal stored in a buffer into an x264_nal_t */
+int x264_nal_decode( x264_nal_t *nal, void *, int );
+
+/****************************************************************************
+ * Encoder functions:
+ ****************************************************************************/
+
+/* x264_encoder_open:
+ *      create a new encoder handle; all parameters from x264_param_t are copied */
+x264_t *x264_encoder_open   ( x264_param_t * );
+/* x264_encoder_headers:
+ *      return the SPS and PPS that will be used for the whole stream */
+int     x264_encoder_headers( x264_t *, x264_nal_t **, int * );
+/* x264_encoder_encode:
+ *      encode one picture; outputs the nal array and its count, a return < 0 is a failure */
+int     x264_encoder_encode ( x264_t *, x264_nal_t **, int *, x264_picture_t * );
+/* x264_encoder_close:
+ *      close an encoder handle */
+void    x264_encoder_close  ( x264_t * );
+
+/* XXX: the decoder isn't working, so its API is compiled out instead of being exported */
+#if 0
+/****************************************************************************
+ * Decoder functions:
+ ****************************************************************************
+ * XXX: Not yet working so do not try ...
+ ****************************************************************************/
+/* x264_decoder_open:
+ */
+x264_t *x264_decoder_open   ( x264_param_t * );
+/* x264_decoder_decode:
+ */
+int     x264_decoder_decode ( x264_t *, x264_picture_t **, x264_nal_t * );
+/* x264_decoder_close:
+ */
+void    x264_decoder_close  ( x264_t * );
+#endif /* 0: disabled decoder API */
+
+/****************************************************************************
+ * Private stuff for internal usage:
+ ****************************************************************************/
+#ifdef __X264__ /* DECLARE_ALIGNED( type, var, n ): declare var of the given type with an alignment of n */
+#   ifdef _MSC_VER
+#       define inline __inline  /* map the inline keyword to the MSVC spelling */
+#       define DECLARE_ALIGNED( type, var, n ) __declspec(align(n)) type var
+#   else
+#       define DECLARE_ALIGNED( type, var, n ) type var __attribute__((aligned(n)))
+#   endif /* _MSC_VER */
+#endif /* __X264__ */
+
+#endif
-- 
2.40.0