From: Laurent Aimar <fenrir@videolan.org>
Date: Thu, 3 Jun 2004 19:29:33 +0000 (+0000)
Subject:  * all: re-import of the CVS.
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=5dc0aae2f900064d1f58579929a2285ab289a436;p=libx264

 * all: re-import of the CVS.


git-svn-id: svn://svn.videolan.org/x264/trunk@1 df754926-b1dd-0310-bc7b-ec298dee348c
---

5dc0aae2f900064d1f58579929a2285ab289a436
diff --git a/.cvsignore b/.cvsignore
new file mode 100644
index 00000000..82e863c8
--- /dev/null
+++ b/.cvsignore
@@ -0,0 +1,3 @@
+.depend
+x264
+checkasm
diff --git a/AUTHORS b/AUTHORS
new file mode 100644
index 00000000..af06be2a
--- /dev/null
+++ b/AUTHORS
@@ -0,0 +1,35 @@
+# $Id: AUTHORS,v 1.1 2004/06/03 19:27:06 fenrir Exp $
+# 
+# The format of this file was inspired by the Linux kernel CREDITS file.
+# Authors are listed alphabetically.
+# 
+# The fields are: name (N), email (E), web-address (W), CVS account login (C),
+# PGP key ID and fingerprint (P), description (D), and snail-mail address (S).
+
+N: Laurent Aimar
+E: fenrir AT via.ecp DOT fr
+C: fenrir
+D: Initial import, maintainer
+D: i386 asm (mmx/mmx2)
+S: France
+
+N: Eric Petit
+E: titer AT videolan DOT org
+C: titer
+D: Altivec
+D: BeOS and MacOS X ports.
+S: France
+
+N: Min Chen
+E: chenm001 AT 163 DOT com
+C: chenm001
+D: Win32/VC 6.0 port
+D: gcc asm to nasm conversion
+D: vfw interface
+S: China
+
+N: Justin Clay
+E: justin_clay AT hotmail DOT com
+C: wheatgerm
+D: Initial work on vfw
+S: Nova Scotia, Canada
diff --git a/COPYING b/COPYING
new file mode 100644
index 00000000..d60c31a9
--- /dev/null
+++ b/COPYING
@@ -0,0 +1,340 @@
+		    GNU GENERAL PUBLIC LICENSE
+		       Version 2, June 1991
+
+ Copyright (C) 1989, 1991 Free Software Foundation, Inc.
+     59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+			    Preamble
+
+  The licenses for most software are designed to take away your
+freedom to share and change it.  By contrast, the GNU General Public
+License is intended to guarantee your freedom to share and change free
+software--to make sure the software is free for all its users.  This
+General Public License applies to most of the Free Software
+Foundation's software and to any other program whose authors commit to
+using it.  (Some other Free Software Foundation software is covered by
+the GNU Library General Public License instead.)  You can apply it to
+your programs, too.
+
+  When we speak of free software, we are referring to freedom, not
+price.  Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+this service if you wish), that you receive source code or can get it
+if you want it, that you can change the software or use pieces of it
+in new free programs; and that you know you can do these things.
+
+  To protect your rights, we need to make restrictions that forbid
+anyone to deny you these rights or to ask you to surrender the rights.
+These restrictions translate to certain responsibilities for you if you
+distribute copies of the software, or if you modify it.
+
+  For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must give the recipients all the rights that
+you have.  You must make sure that they, too, receive or can get the
+source code.  And you must show them these terms so they know their
+rights.
+
+  We protect your rights with two steps: (1) copyright the software, and
+(2) offer you this license which gives you legal permission to copy,
+distribute and/or modify the software.
+
+  Also, for each author's protection and ours, we want to make certain
+that everyone understands that there is no warranty for this free
+software.  If the software is modified by someone else and passed on, we
+want its recipients to know that what they have is not the original, so
+that any problems introduced by others will not reflect on the original
+authors' reputations.
+
+  Finally, any free program is threatened constantly by software
+patents.  We wish to avoid the danger that redistributors of a free
+program will individually obtain patent licenses, in effect making the
+program proprietary.  To prevent this, we have made it clear that any
+patent must be licensed for everyone's free use or not licensed at all.
+
+  The precise terms and conditions for copying, distribution and
+modification follow.
+
+		    GNU GENERAL PUBLIC LICENSE
+   TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+  0. This License applies to any program or other work which contains
+a notice placed by the copyright holder saying it may be distributed
+under the terms of this General Public License.  The "Program", below,
+refers to any such program or work, and a "work based on the Program"
+means either the Program or any derivative work under copyright law:
+that is to say, a work containing the Program or a portion of it,
+either verbatim or with modifications and/or translated into another
+language.  (Hereinafter, translation is included without limitation in
+the term "modification".)  Each licensee is addressed as "you".
+
+Activities other than copying, distribution and modification are not
+covered by this License; they are outside its scope.  The act of
+running the Program is not restricted, and the output from the Program
+is covered only if its contents constitute a work based on the
+Program (independent of having been made by running the Program).
+Whether that is true depends on what the Program does.
+
+  1. You may copy and distribute verbatim copies of the Program's
+source code as you receive it, in any medium, provided that you
+conspicuously and appropriately publish on each copy an appropriate
+copyright notice and disclaimer of warranty; keep intact all the
+notices that refer to this License and to the absence of any warranty;
+and give any other recipients of the Program a copy of this License
+along with the Program.
+
+You may charge a fee for the physical act of transferring a copy, and
+you may at your option offer warranty protection in exchange for a fee.
+
+  2. You may modify your copy or copies of the Program or any portion
+of it, thus forming a work based on the Program, and copy and
+distribute such modifications or work under the terms of Section 1
+above, provided that you also meet all of these conditions:
+
+    a) You must cause the modified files to carry prominent notices
+    stating that you changed the files and the date of any change.
+
+    b) You must cause any work that you distribute or publish, that in
+    whole or in part contains or is derived from the Program or any
+    part thereof, to be licensed as a whole at no charge to all third
+    parties under the terms of this License.
+
+    c) If the modified program normally reads commands interactively
+    when run, you must cause it, when started running for such
+    interactive use in the most ordinary way, to print or display an
+    announcement including an appropriate copyright notice and a
+    notice that there is no warranty (or else, saying that you provide
+    a warranty) and that users may redistribute the program under
+    these conditions, and telling the user how to view a copy of this
+    License.  (Exception: if the Program itself is interactive but
+    does not normally print such an announcement, your work based on
+    the Program is not required to print an announcement.)
+
+These requirements apply to the modified work as a whole.  If
+identifiable sections of that work are not derived from the Program,
+and can be reasonably considered independent and separate works in
+themselves, then this License, and its terms, do not apply to those
+sections when you distribute them as separate works.  But when you
+distribute the same sections as part of a whole which is a work based
+on the Program, the distribution of the whole must be on the terms of
+this License, whose permissions for other licensees extend to the
+entire whole, and thus to each and every part regardless of who wrote it.
+
+Thus, it is not the intent of this section to claim rights or contest
+your rights to work written entirely by you; rather, the intent is to
+exercise the right to control the distribution of derivative or
+collective works based on the Program.
+
+In addition, mere aggregation of another work not based on the Program
+with the Program (or with a work based on the Program) on a volume of
+a storage or distribution medium does not bring the other work under
+the scope of this License.
+
+  3. You may copy and distribute the Program (or a work based on it,
+under Section 2) in object code or executable form under the terms of
+Sections 1 and 2 above provided that you also do one of the following:
+
+    a) Accompany it with the complete corresponding machine-readable
+    source code, which must be distributed under the terms of Sections
+    1 and 2 above on a medium customarily used for software interchange; or,
+
+    b) Accompany it with a written offer, valid for at least three
+    years, to give any third party, for a charge no more than your
+    cost of physically performing source distribution, a complete
+    machine-readable copy of the corresponding source code, to be
+    distributed under the terms of Sections 1 and 2 above on a medium
+    customarily used for software interchange; or,
+
+    c) Accompany it with the information you received as to the offer
+    to distribute corresponding source code.  (This alternative is
+    allowed only for noncommercial distribution and only if you
+    received the program in object code or executable form with such
+    an offer, in accord with Subsection b above.)
+
+The source code for a work means the preferred form of the work for
+making modifications to it.  For an executable work, complete source
+code means all the source code for all modules it contains, plus any
+associated interface definition files, plus the scripts used to
+control compilation and installation of the executable.  However, as a
+special exception, the source code distributed need not include
+anything that is normally distributed (in either source or binary
+form) with the major components (compiler, kernel, and so on) of the
+operating system on which the executable runs, unless that component
+itself accompanies the executable.
+
+If distribution of executable or object code is made by offering
+access to copy from a designated place, then offering equivalent
+access to copy the source code from the same place counts as
+distribution of the source code, even though third parties are not
+compelled to copy the source along with the object code.
+
+  4. You may not copy, modify, sublicense, or distribute the Program
+except as expressly provided under this License.  Any attempt
+otherwise to copy, modify, sublicense or distribute the Program is
+void, and will automatically terminate your rights under this License.
+However, parties who have received copies, or rights, from you under
+this License will not have their licenses terminated so long as such
+parties remain in full compliance.
+
+  5. You are not required to accept this License, since you have not
+signed it.  However, nothing else grants you permission to modify or
+distribute the Program or its derivative works.  These actions are
+prohibited by law if you do not accept this License.  Therefore, by
+modifying or distributing the Program (or any work based on the
+Program), you indicate your acceptance of this License to do so, and
+all its terms and conditions for copying, distributing or modifying
+the Program or works based on it.
+
+  6. Each time you redistribute the Program (or any work based on the
+Program), the recipient automatically receives a license from the
+original licensor to copy, distribute or modify the Program subject to
+these terms and conditions.  You may not impose any further
+restrictions on the recipients' exercise of the rights granted herein.
+You are not responsible for enforcing compliance by third parties to
+this License.
+
+  7. If, as a consequence of a court judgment or allegation of patent
+infringement or for any other reason (not limited to patent issues),
+conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License.  If you cannot
+distribute so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you
+may not distribute the Program at all.  For example, if a patent
+license would not permit royalty-free redistribution of the Program by
+all those who receive copies directly or indirectly through you, then
+the only way you could satisfy both it and this License would be to
+refrain entirely from distribution of the Program.
+
+If any portion of this section is held invalid or unenforceable under
+any particular circumstance, the balance of the section is intended to
+apply and the section as a whole is intended to apply in other
+circumstances.
+
+It is not the purpose of this section to induce you to infringe any
+patents or other property right claims or to contest validity of any
+such claims; this section has the sole purpose of protecting the
+integrity of the free software distribution system, which is
+implemented by public license practices.  Many people have made
+generous contributions to the wide range of software distributed
+through that system in reliance on consistent application of that
+system; it is up to the author/donor to decide if he or she is willing
+to distribute software through any other system and a licensee cannot
+impose that choice.
+
+This section is intended to make thoroughly clear what is believed to
+be a consequence of the rest of this License.
+
+  8. If the distribution and/or use of the Program is restricted in
+certain countries either by patents or by copyrighted interfaces, the
+original copyright holder who places the Program under this License
+may add an explicit geographical distribution limitation excluding
+those countries, so that distribution is permitted only in or among
+countries not thus excluded.  In such case, this License incorporates
+the limitation as if written in the body of this License.
+
+  9. The Free Software Foundation may publish revised and/or new versions
+of the General Public License from time to time.  Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+Each version is given a distinguishing version number.  If the Program
+specifies a version number of this License which applies to it and "any
+later version", you have the option of following the terms and conditions
+either of that version or of any later version published by the Free
+Software Foundation.  If the Program does not specify a version number of
+this License, you may choose any version ever published by the Free Software
+Foundation.
+
+  10. If you wish to incorporate parts of the Program into other free
+programs whose distribution conditions are different, write to the author
+to ask for permission.  For software which is copyrighted by the Free
+Software Foundation, write to the Free Software Foundation; we sometimes
+make exceptions for this.  Our decision will be guided by the two goals
+of preserving the free status of all derivatives of our free software and
+of promoting the sharing and reuse of software generally.
+
+			    NO WARRANTY
+
+  11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
+FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW.  EXCEPT WHEN
+OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
+PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
+OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.  THE ENTIRE RISK AS
+TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU.  SHOULD THE
+PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
+REPAIR OR CORRECTION.
+
+  12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
+REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
+INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
+OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
+TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
+YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
+PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGES.
+
+		     END OF TERMS AND CONDITIONS
+
+	    How to Apply These Terms to Your New Programs
+
+  If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+  To do so, attach the following notices to the program.  It is safest
+to attach them to the start of each source file to most effectively
+convey the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+    <one line to give the program's name and a brief idea of what it does.>
+    Copyright (C) <year>  <name of author>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program; if not, write to the Free Software
+    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+
+
+Also add information on how to contact you by electronic and paper mail.
+
+If the program is interactive, make it output a short notice like this
+when it starts in an interactive mode:
+
+    Gnomovision version 69, Copyright (C) year  name of author
+    Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+    This is free software, and you are welcome to redistribute it
+    under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License.  Of course, the commands you use may
+be called something other than `show w' and `show c'; they could even be
+mouse-clicks or menu items--whatever suits your program.
+
+You should also get your employer (if you work as a programmer) or your
+school, if any, to sign a "copyright disclaimer" for the program, if
+necessary.  Here is a sample; alter the names:
+
+  Yoyodyne, Inc., hereby disclaims all copyright interest in the program
+  `Gnomovision' (which makes passes at compilers) written by James Hacker.
+
+  <signature of Ty Coon>, 1 April 1989
+  Ty Coon, President of Vice
+
+This General Public License does not permit incorporating your program into
+proprietary programs.  If your program is a subroutine library, you may
+consider it more useful to permit linking proprietary applications with the
+library.  If this is what you want to do, use the GNU Library General
+Public License instead of this License.
diff --git a/Jamfile b/Jamfile
new file mode 100644
index 00000000..3266e836
--- /dev/null
+++ b/Jamfile
@@ -0,0 +1,67 @@
+# $Id: Jamfile,v 1.1 2004/06/03 19:27:06 fenrir Exp $
+#
+
+# Compiler and assembler
+CC = gcc ;
+AS = nasm ;
+
+# Compiler flags
+# To profile: -fprofile-arcs
+# Once done : -fbranch-probabilities
+CCFLAGS = -g -Wall -W ;
+
+# Global preprocessor defines
+DEFINES = DEBUG __X264__ ;
+
+# Optimization flags
+OPTIM = -O3 -funroll-loops ;
+
+# Header search directories (core/i386 matches the SOURCES_X86/SOURCES_MMX paths)
+HDRS = . core core/i386 decoder encoder ;
+
+SOURCES_C = core/mc.c core/predict.c core/pixel.c core/macroblock.c
+            core/frame.c core/dct.c core/cpu.c core/cabac.c
+            core/common.c core/mdate.c core/csp.c
+            encoder/analyse.c encoder/me.c encoder/ratecontrol.c
+            encoder/set.c encoder/macroblock.c encoder/cabac.c encoder/cavlc.c
+            encoder/encoder.c ;
+
+SOURCES_X86 = core/i386/cpu.asm ;
+SOURCES_MMX = core/i386/mc-c.c core/i386/dct-c.c core/i386/predict.c core/i386/dct.asm core/i386/pixel.asm core/i386/mc.asm ;
+
+SOURCES_ALTIVEC = core/ppc/mc.c core/ppc/pixel.c ;
+
+# libx264: portable C sources plus the per-OS optimized sources
+SOURCES_X264 = $(SOURCES_C) ;
+if $(OS) = LINUX
+{
+    DEFINES      += ARCH_X86 HAVE_MMXEXT HAVE_MALLOC_H ;
+    SOURCES_X264 += $(SOURCES_MMX) ;
+    SOURCES_X264 += $(SOURCES_X86) ;
+    ASFLAGS = -f elf ;
+
+    # Don't ask why
+    NOARUPDATE = false ;
+}
+if $(OS) = MACOSX
+{
+    DEFINES      += HAVE_ALTIVEC ;
+    SOURCES_X264 += $(SOURCES_ALTIVEC) ;
+    CCFLAGS      += -faltivec ;
+#    OPTIM        += -falign-loops=16 ;
+}
+Library libx264 : $(SOURCES_X264) ;
+
+# x264 command-line program
+LINKLIBS += -lm ;
+LinkLibraries x264 : libx264.a ;
+Main x264 : x264.c ;
+
+# checkasm test program
+LinkLibraries checkasm : libx264.a ;
+Main checkasm : testing/checkasm.c ;
+
+# XXX Do not remove *.o files: RmTemps is given an empty action body
+actions quietly updated piecemeal together RmTemps
+{
+}
diff --git a/Makefile b/Makefile
new file mode 100644
index 00000000..25060f5d
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,58 @@
+# Makefile: tuned for i386/MMX system only
+# For ppc append
+#  SRCS: core/ppc/mc.c core/ppc/pixel.c
+#  Defines: HAVE_ALTIVEC
+#  CFLAGS: -faltivec
+#
+CC=gcc
+CFLAGS=-g -Wall -I. -DDEBUG -O4 -funroll-loops -D__X264__ -DHAVE_MALLOC_H -DHAVE_MMXEXT -DARCH_X86
+
+SRCS=  core/mc.c core/predict.c core/pixel.c core/macroblock.c \
+       core/frame.c core/dct.c core/cpu.c core/cabac.c \
+       core/common.c core/mdate.c core/csp.c \
+       encoder/analyse.c encoder/me.c encoder/ratecontrol.c \
+       encoder/set.c encoder/macroblock.c encoder/cabac.c encoder/cavlc.c \
+       encoder/encoder.c \
+       core/i386/mc-c.c core/i386/dct-c.c core/i386/predict.c \
+       x264.c
+
+AS= nasm
+# for linux
+ASFLAGS=-f elf
+# for cygwin
+#ASFLAGS=-f gnuwin32 -DPREFIX
+
+ASMSRC= core/i386/dct.asm core/i386/cpu.asm core/i386/pixel.asm  core/i386/mc.asm
+OBJASM= $(ASMSRC:%.asm=%.o)
+
+OBJS = $(SRCS:%.c=%.o)
+DEP  = depend
+
+default: $(DEP) x264
+
+libx264.a: $(OBJS) $(OBJASM)
+	ar rc libx264.a $(OBJS) $(OBJASM)
+
+x264: libx264.a x264.o
+	$(CC) $(CFLAGS) -o x264 x264.o libx264.a -lm
+
+checkasm: testing/checkasm.c libx264.a
+	$(CC) $(CFLAGS) -o checkasm $< libx264.a -lm
+
+%.o: %.asm
+	$(AS) $(ASFLAGS) -o $@ $<
+# gcc -MM strips the directory from target names; -MT restores e.g. core/mc.o.
+.depend: $(SRCS)
+	rm -f .depend; for src in $(SRCS); do $(CC) -MM $(CFLAGS) -MT "$${src%.c}.o" "$$src" 1>> .depend || exit 1; done
+
+depend: .depend
+ifneq ($(wildcard .depend),)
+include .depend
+endif
+
+clean:
+	rm -f $(OBJS) $(OBJASM) *.a x264.o .depend x264
+
+distclean:
+	rm -f $(OBJS) $(OBJASM) *.a x264.o .depend x264
+
diff --git a/Makefile.cygwin b/Makefile.cygwin
new file mode 100644
index 00000000..b99f8dc4
--- /dev/null
+++ b/Makefile.cygwin
@@ -0,0 +1,52 @@
+# Makefile: tuned for i386/MMX cygwin system only
+#
+CC=gcc
+CFLAGS=-g -Wall -I. -mno-cygwin -DDEBUG -O4 -funroll-loops -D__X264__ -UHAVE_MALLOC_H -DHAVE_MMXEXT -DARCH_X86
+
+SRCS=  core/mc.c core/predict.c core/pixel.c core/macroblock.c \
+       core/frame.c core/dct.c core/cpu.c core/cabac.c \
+       core/common.c core/mdate.c core/csp.c \
+       encoder/analyse.c encoder/me.c encoder/ratecontrol.c \
+       encoder/set.c encoder/macroblock.c encoder/cabac.c encoder/cavlc.c \
+       encoder/encoder.c \
+       core/i386/mc-c.c core/i386/dct-c.c core/i386/predict.c \
+       x264.c
+
+AS= nasm
+#for cygwin
+ASFLAGS=-f win32 -DPREFIX
+
+ASMSRC= core/i386/dct.asm core/i386/cpu.asm core/i386/pixel.asm core/i386/mc.asm
+OBJASM= $(ASMSRC:%.asm=%.o)
+
+OBJS = $(SRCS:%.c=%.o)
+DEP  = depend
+
+default: $(DEP) x264
+
+libx264.a: $(OBJS) $(OBJASM)
+	ar rc libx264.a $(OBJS) $(OBJASM)
+
+x264: libx264.a x264.o
+	$(CC) $(CFLAGS) -o x264 x264.o libx264.a -lm
+
+checkasm: testing/checkasm.c libx264.a
+	$(CC) $(CFLAGS) -o checkasm $< libx264.a -lm
+
+%.o: %.asm
+	$(AS) $(ASFLAGS) -o $@ $<
+# gcc -MM strips the directory from target names; -MT restores e.g. core/mc.o.
+.depend: $(SRCS)
+	rm -f .depend; for src in $(SRCS); do $(CC) -MM $(CFLAGS) -MT "$${src%.c}.o" "$$src" 1>> .depend || exit 1; done
+
+depend: .depend
+ifneq ($(wildcard .depend),)
+include .depend
+endif
+
+clean:
+	rm -f $(OBJS) $(OBJASM) *.a x264.o .depend x264
+
+distclean:
+	rm -f $(OBJS) $(OBJASM) *.a x264.o .depend x264
+
diff --git a/TODO b/TODO
new file mode 100644
index 00000000..ec1ab53f
--- /dev/null
+++ b/TODO
@@ -0,0 +1,65 @@
+ It is far from complete, anyway :
+
+General:
+--------
+ Encoder:
+ ########
+
+ * CABAC: check if adaptive model is really working. (I didn't see any improvements)
+
+ * Field support : no and I probably won't do it.
+
+ * Slice A/B/C (partition): is there any interest in doing it? (Shouldn't be hard).
+    - extend x264_t
+    - review x264_macroblock_write_cavlc
+
+ * Intra encoding:
+    - in I_4x4 mode, some prediction modes are neither tested nor supported:
+    those where some surrounding pixels are unavailable but could be predicted
+    from others (see the standard).
+
+ * Inter coding:
+    - D_4x8 D_8x4 and D_4x4 ME P block -> done but too slow.
+    - B_ types.
+    - scene change detection.
+    - long term ?
+    - ...
+
+ * B frame: B_L0/L1/BI work in cavlc (need more testing).
+    -> need to do all others mb type (first B_DIRECT and B_SKIP)
+    -> cabac
+    -> look at weighted prediction (should give better results)
+    -> better analyse algo (as always ;)
+
+ * Speed issue (oprofile is your friend)
+    - mc.c:30% and pixel.c:20% (mc is used by ME)
+    - Motion Estimation -> try better/faster algos.
+    - loop filter
+    - stream writing (bs)
+    - ...
+
+ * Time spent: (test file: 720x576, mmx, mmxext)
+    CAVLC: analyse=73% encode=15% write=4% filter=6%
+    CABAC: analyse=69% encode=16% write=8% filter=5%
+
+ * Limitations:
+    - frame width/height %16 == 0 only.
+
+ * ...
+
+ Decoder:
+ ########
+
+ * Currently decoder/* won't even compile, and anyway is unusable.
+
+ Build:
+ ######
+ * Port gcc inlined asm to nasm files (BUT without any speed loss, otherwise
+   the port will be rejected).
+
+Coding issue:
+-------------
+ * tables: some are duplicated -> find a solution (easy).
+ * documentations ? (mouaaaarfff ;)
+ * ...
+
diff --git a/build/cygwin/Makefile b/build/cygwin/Makefile
new file mode 100644
index 00000000..d4458cc5
--- /dev/null
+++ b/build/cygwin/Makefile
@@ -0,0 +1,102 @@
+##############################################################################
+#
+# Makefile for libx264.a and x264
+#
+# Author: x264 by Laurent Aimar <fenrir@via.ecp.fr>
+#
+# $Id: Makefile,v 1.1 2004/06/03 19:27:06 fenrir Exp $
+##############################################################################
+
+# Current dir
+DIR_CUR=$(shell pwd)
+
+# Path to src
+DIR_SRC=$(DIR_CUR)/../..
+
+# Sources
+SRC_C= core/mc.c core/predict.c core/pixel.c core/macroblock.c \
+       core/frame.c core/dct.c core/cpu.c core/cabac.c \
+       core/common.c core/mdate.c core/csp.c \
+       encoder/analyse.c encoder/me.c encoder/ratecontrol.c \
+       encoder/set.c encoder/macroblock.c encoder/cabac.c encoder/cavlc.c \
+       encoder/encoder.c \
+       core/i386/mc-c.c core/i386/dct-c.c core/i386/predict.c
+
+SRC_ASM= core/i386/dct.asm core/i386/cpu.asm core/i386/pixel.asm core/i386/mc.asm
+
+# Alias
+RM= rm -rf
+
+##############################################################################
+# CFLAGS
+##############################################################################
+
+# Constants which should not be modified
+# The `mingw-runtime` package is required when building with -mno-cygwin
+CFLAGS += -I$(DIR_SRC)
+CFLAGS += -mno-cygwin
+CFLAGS += -D__X264__ -DARCH_X86 -DHAVE_MMXEXT -D_CYGWIN
+
+# Optional Compiler options
+CFLAGS += -g -Wall -DDEBUG
+CFLAGS += -O3
+CFLAGS += -finline-functions
+CFLAGS += -funroll-loops
+CFLAGS += -ffast-math
+
+
+##############################################################################
+# Linker flags. NOTE(review): no rule below uses LDFLAGS; DIR_LIB is not set here
+##############################################################################
+
+LDFLAGS += -L$(DIR_LIB) -lx264
+
+##############################################################################
+# ASM
+##############################################################################
+AS= nasm
+ASFLAGS= -f gnuwin32 -DPREFIX
+##############################################################################
+# Rules
+##############################################################################
+
+OBJECTS = $(SRC_C:.c=.obj)
+OBJECTS+= $(SRC_ASM:.asm=.obj)
+
+.SUFFIXES: .obj .asm .c
+
+DIR_BUILD= $(DIR_CUR)/bin
+VPATH = $(DIR_SRC):$(DIR_BUILD)
+
+all: libx264.a x264.exe
+
+$(DIR_BUILD):
+	@echo " D: $(DIR_BUILD)"
+	@mkdir -p $(DIR_BUILD)
+
+.asm.obj:
+	@echo " A: $(@D)/$(<F)"
+	@mkdir -p $(DIR_BUILD)/$(@D)
+	@$(AS) $(ASFLAGS) -o $(DIR_BUILD)/$@ $<
+
+.c.obj:
+	@echo " C: $(@D)/$(<F)"
+	@mkdir -p $(DIR_BUILD)/$(@D)
+	@$(CC) $(CFLAGS) -c -o $(DIR_BUILD)/$@ $<
+
+libx264.a: $(DIR_BUILD) $(OBJECTS)
+	@echo " L: $(@F)"
+	@cd $(DIR_BUILD) && \
+	ar rc libx264.a $(OBJECTS) && \
+	cp -f libx264.a $(DIR_SRC)
+# The link step below reads bin/libx264.a, so the archive must be built first.
+x264.exe: $(DIR_BUILD) $(OBJECTS) libx264.a x264.obj
+	@echo " L: $(@F)"
+	@cd $(DIR_BUILD) && \
+	$(CC) $(CFLAGS) -o x264 x264.obj libx264.a -lm && \
+	cp -f x264.exe $(DIR_SRC)
+
+clean:
+	@echo " Cl: Object files and target lib"
+	@$(RM) $(DIR_BUILD)
+
diff --git a/build/win32/libx264.dsp b/build/win32/libx264.dsp
new file mode 100644
index 00000000..63a4d6a0
--- /dev/null
+++ b/build/win32/libx264.dsp
@@ -0,0 +1,742 @@
+# Microsoft Developer Studio Project File - Name="libx264" - Package Owner=<4>
+# Microsoft Developer Studio Generated Build File, Format Version 6.00
+# ** DO NOT EDIT **
+
+# TARGTYPE "Win32 (x86) Static Library" 0x0104
+
+CFG=libx264 - Win32 Debug
+!MESSAGE This is not a valid makefile. To build this project using NMAKE,
+!MESSAGE use the Export Makefile command and run
+!MESSAGE 
+!MESSAGE NMAKE /f "libx264.mak".
+!MESSAGE 
+!MESSAGE You can specify a configuration when running NMAKE
+!MESSAGE by defining the macro CFG on the command line. For example:
+!MESSAGE 
+!MESSAGE NMAKE /f "libx264.mak" CFG="libx264 - Win32 Debug"
+!MESSAGE 
+!MESSAGE Possible choices for configuration are:
+!MESSAGE 
+!MESSAGE "libx264 - Win32 Release" (based on "Win32 (x86) Static Library")
+!MESSAGE "libx264 - Win32 Debug" (based on "Win32 (x86) Static Library")
+!MESSAGE 
+
+# Begin Project
+# PROP AllowPerConfigDependencies 0
+# PROP Scc_ProjName ""
+# PROP Scc_LocalPath ""
+CPP=cl.exe
+RSC=rc.exe
+
+!IF  "$(CFG)" == "libx264 - Win32 Release"
+
+# PROP BASE Use_MFC 0
+# PROP BASE Use_Debug_Libraries 0
+# PROP BASE Output_Dir "Release"
+# PROP BASE Intermediate_Dir "Release"
+# PROP BASE Target_Dir ""
+# PROP Use_MFC 0
+# PROP Use_Debug_Libraries 0
+# PROP Output_Dir "Release"
+# PROP Intermediate_Dir "Release"
+# PROP Target_Dir ""
+# ADD BASE CPP /nologo /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_MBCS" /D "_LIB" /YX /FD /c
+# ADD CPP /nologo /W3 /GX /O2 /I "./core" /I "./encode" /I "./decode" /I "../../extras" /I "../.." /D "NDEBUG" /D "_LIB" /D "WIN32" /D "_MBCS" /D "__X264__" /D "HAVE_MMXEXT" /D "ARCH_X86" /FD /c
+# SUBTRACT CPP /YX
+# ADD BASE RSC /l 0x804 /d "NDEBUG"
+# ADD RSC /l 0x804 /d "NDEBUG"
+BSC32=bscmake.exe
+# ADD BASE BSC32 /nologo
+# ADD BSC32 /nologo
+LIB32=link.exe -lib
+# ADD BASE LIB32 /nologo
+# ADD LIB32 /nologo /out:"bin/libx264.lib"
+
+!ELSEIF  "$(CFG)" == "libx264 - Win32 Debug"
+
+# PROP BASE Use_MFC 0
+# PROP BASE Use_Debug_Libraries 1
+# PROP BASE Output_Dir "Debug"
+# PROP BASE Intermediate_Dir "Debug"
+# PROP BASE Target_Dir ""
+# PROP Use_MFC 0
+# PROP Use_Debug_Libraries 1
+# PROP Output_Dir "Debug"
+# PROP Intermediate_Dir "Debug"
+# PROP Target_Dir ""
+# ADD BASE CPP /nologo /W3 /Gm /GX /ZI /Od /D "WIN32" /D "_DEBUG" /D "_MBCS" /D "_LIB" /YX /FD /GZ /c
+# ADD CPP /nologo /W3 /Gm /GX /Zi /Od /I "./core" /I "./encode" /I "./decode" /I "../../extras" /I "../.." /D "_DEBUG" /D "_LIB" /D "WIN32" /D "_MBCS" /D "__X264__" /D "HAVE_MMXEXT" /D "ARCH_X86" /FD /GZ /c
+# SUBTRACT CPP /YX
+# ADD BASE RSC /l 0x804 /d "_DEBUG"
+# ADD RSC /l 0x804 /d "_DEBUG"
+BSC32=bscmake.exe
+# ADD BASE BSC32 /nologo
+# ADD BSC32 /nologo
+LIB32=link.exe -lib
+# ADD BASE LIB32 /nologo
+# ADD LIB32 /nologo /out:"bin/libx264.lib"
+
+!ENDIF 
+
+# Begin Target
+
+# Name "libx264 - Win32 Release"
+# Name "libx264 - Win32 Debug"
+# Begin Group "Enc"
+
+# PROP Default_Filter ".c"
+# Begin Group "enc_h"
+
+# PROP Default_Filter ".h"
+# Begin Source File
+
+SOURCE=..\..\encoder\analyse.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\encoder\macroblock.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\encoder\me.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\encoder\ratecontrol.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\encoder\set.h
+# End Source File
+# End Group
+# Begin Source File
+
+SOURCE=..\..\encoder\analyse.c
+
+!IF  "$(CFG)" == "libx264 - Win32 Release"
+
+# PROP Intermediate_Dir "obj/enc_release"
+
+!ELSEIF  "$(CFG)" == "libx264 - Win32 Debug"
+
+# PROP Intermediate_Dir "obj/enc_debug"
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\encoder\cabac.c
+
+!IF  "$(CFG)" == "libx264 - Win32 Release"
+
+# PROP Intermediate_Dir "obj/enc_release"
+
+!ELSEIF  "$(CFG)" == "libx264 - Win32 Debug"
+
+# PROP Intermediate_Dir "obj/enc_debug"
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\encoder\cavlc.c
+
+!IF  "$(CFG)" == "libx264 - Win32 Release"
+
+# PROP Intermediate_Dir "obj/enc_release"
+
+!ELSEIF  "$(CFG)" == "libx264 - Win32 Debug"
+
+# PROP Intermediate_Dir "obj/enc_debug"
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\encoder\encoder.c
+
+!IF  "$(CFG)" == "libx264 - Win32 Release"
+
+# PROP Intermediate_Dir "obj/enc_release"
+
+!ELSEIF  "$(CFG)" == "libx264 - Win32 Debug"
+
+# PROP Intermediate_Dir "obj/enc_debug"
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\encoder\macroblock.c
+
+!IF  "$(CFG)" == "libx264 - Win32 Release"
+
+# PROP Intermediate_Dir "obj/enc_release"
+
+!ELSEIF  "$(CFG)" == "libx264 - Win32 Debug"
+
+# PROP Intermediate_Dir "obj/enc_debug"
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\encoder\me.c
+
+!IF  "$(CFG)" == "libx264 - Win32 Release"
+
+# PROP Intermediate_Dir "obj/enc_release"
+
+!ELSEIF  "$(CFG)" == "libx264 - Win32 Debug"
+
+# PROP Intermediate_Dir "obj/enc_debug"
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\encoder\ratecontrol.c
+
+!IF  "$(CFG)" == "libx264 - Win32 Release"
+
+# PROP Intermediate_Dir "obj/enc_release"
+
+!ELSEIF  "$(CFG)" == "libx264 - Win32 Debug"
+
+# PROP Intermediate_Dir "obj/enc_debug"
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\encoder\set.c
+
+!IF  "$(CFG)" == "libx264 - Win32 Release"
+
+# PROP Intermediate_Dir "obj/enc_release"
+
+!ELSEIF  "$(CFG)" == "libx264 - Win32 Debug"
+
+# PROP Intermediate_Dir "obj/enc_debug"
+
+!ENDIF 
+
+# End Source File
+# End Group
+# Begin Group "Core"
+
+# PROP Default_Filter ".c;.h;"
+# Begin Group "core_h"
+
+# PROP Default_Filter ".h"
+# Begin Source File
+
+SOURCE=..\..\core\bs.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\core\cabac.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\core\clip1.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\core\common.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\core\cpu.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\core\csp.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\core\dct.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\core\frame.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\core\macroblock.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\core\mc.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\core\pixel.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\core\predict.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\core\set.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\core\vlc.h
+# End Source File
+# End Group
+# Begin Group "I386"
+
+# PROP Default_Filter "*.h,*.c,*.asm"
+# Begin Source File
+
+SOURCE=..\..\core\i386\cpu.asm
+
+!IF  "$(CFG)" == "libx264 - Win32 Release"
+
+# Begin Custom Build
+IntDir=.\Release
+InputPath=..\..\core\i386\cpu.asm
+InputName=cpu
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasm -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ELSEIF  "$(CFG)" == "libx264 - Win32 Debug"
+
+# Begin Custom Build
+IntDir=.\Debug
+InputPath=..\..\core\i386\cpu.asm
+InputName=cpu
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasm -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE="..\..\core\i386\dct-c.c"
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\core\i386\dct.asm
+
+!IF  "$(CFG)" == "libx264 - Win32 Release"
+
+# Begin Custom Build
+IntDir=.\Release
+InputPath=..\..\core\i386\dct.asm
+InputName=dct
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasm -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ELSEIF  "$(CFG)" == "libx264 - Win32 Debug"
+
+# Begin Custom Build
+IntDir=.\Debug
+InputPath=..\..\core\i386\dct.asm
+InputName=dct
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasm -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\core\i386\dct.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\core\i386\pixel.asm
+
+!IF  "$(CFG)" == "libx264 - Win32 Release"
+
+# Begin Custom Build
+IntDir=.\Release
+InputPath=..\..\core\i386\pixel.asm
+InputName=pixel
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasm -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ELSEIF  "$(CFG)" == "libx264 - Win32 Debug"
+
+# Begin Custom Build
+IntDir=.\Debug
+InputPath=..\..\core\i386\pixel.asm
+InputName=pixel
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasm -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\core\i386\pixel.h
+# End Source File
+# End Group
+# Begin Source File
+
+SOURCE=..\..\core\cabac.c
+
+!IF  "$(CFG)" == "libx264 - Win32 Release"
+
+# PROP Intermediate_Dir "obj/core_release"
+
+!ELSEIF  "$(CFG)" == "libx264 - Win32 Debug"
+
+# PROP Intermediate_Dir "obj/core_debug"
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\core\common.c
+
+!IF  "$(CFG)" == "libx264 - Win32 Release"
+
+# PROP Intermediate_Dir "obj/core_release"
+
+!ELSEIF  "$(CFG)" == "libx264 - Win32 Debug"
+
+# PROP Intermediate_Dir "obj/core_debug"
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\core\cpu.c
+
+!IF  "$(CFG)" == "libx264 - Win32 Release"
+
+# PROP Intermediate_Dir "obj/core_release"
+
+!ELSEIF  "$(CFG)" == "libx264 - Win32 Debug"
+
+# PROP Intermediate_Dir "obj/core_debug"
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\core\csp.c
+
+!IF  "$(CFG)" == "libx264 - Win32 Release"
+
+!ELSEIF  "$(CFG)" == "libx264 - Win32 Debug"
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\core\dct.c
+
+!IF  "$(CFG)" == "libx264 - Win32 Release"
+
+# PROP Intermediate_Dir "obj/core_release"
+
+!ELSEIF  "$(CFG)" == "libx264 - Win32 Debug"
+
+# PROP Intermediate_Dir "obj/core_debug"
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\core\frame.c
+
+!IF  "$(CFG)" == "libx264 - Win32 Release"
+
+# PROP Intermediate_Dir "obj/core_release"
+
+!ELSEIF  "$(CFG)" == "libx264 - Win32 Debug"
+
+# PROP Intermediate_Dir "obj/core_debug"
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\core\macroblock.c
+
+!IF  "$(CFG)" == "libx264 - Win32 Release"
+
+# PROP Intermediate_Dir "obj/core_release"
+
+!ELSEIF  "$(CFG)" == "libx264 - Win32 Debug"
+
+# PROP Intermediate_Dir "obj/core_debug"
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\core\mc.c
+
+!IF  "$(CFG)" == "libx264 - Win32 Release"
+
+# PROP Intermediate_Dir "obj/core_release"
+
+!ELSEIF  "$(CFG)" == "libx264 - Win32 Debug"
+
+# PROP Intermediate_Dir "obj/core_debug"
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\core\mdate.c
+
+!IF  "$(CFG)" == "libx264 - Win32 Release"
+
+# PROP Intermediate_Dir "obj/core_release"
+
+!ELSEIF  "$(CFG)" == "libx264 - Win32 Debug"
+
+# PROP Intermediate_Dir "obj/core_debug"
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\core\pixel.c
+
+!IF  "$(CFG)" == "libx264 - Win32 Release"
+
+# PROP Intermediate_Dir "obj/core_release"
+
+!ELSEIF  "$(CFG)" == "libx264 - Win32 Debug"
+
+# PROP Intermediate_Dir "obj/core_debug"
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\core\predict.c
+
+!IF  "$(CFG)" == "libx264 - Win32 Release"
+
+# PROP Intermediate_Dir "obj/core_release"
+
+!ELSEIF  "$(CFG)" == "libx264 - Win32 Debug"
+
+# PROP Intermediate_Dir "obj/core_debug"
+
+!ENDIF 
+
+# End Source File
+# End Group
+# Begin Group "Dec"
+
+# PROP Default_Filter ".c"
+# Begin Group "dec_h"
+
+# PROP Default_Filter ".h"
+# Begin Source File
+
+SOURCE=..\..\decoder\macroblock.h
+
+!IF  "$(CFG)" == "libx264 - Win32 Release"
+
+# PROP Intermediate_Dir "obj/dec_release"
+
+!ELSEIF  "$(CFG)" == "libx264 - Win32 Debug"
+
+# PROP Intermediate_Dir "obj/dec_debug"
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\decoder\set.h
+
+!IF  "$(CFG)" == "libx264 - Win32 Release"
+
+# PROP Intermediate_Dir "obj/dec_release"
+
+!ELSEIF  "$(CFG)" == "libx264 - Win32 Debug"
+
+# PROP Intermediate_Dir "obj/dec_debug"
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\decoder\vlc.h
+
+!IF  "$(CFG)" == "libx264 - Win32 Release"
+
+# PROP Intermediate_Dir "obj/dec_release"
+
+!ELSEIF  "$(CFG)" == "libx264 - Win32 Debug"
+
+# PROP Intermediate_Dir "obj/dec_debug"
+
+!ENDIF 
+
+# End Source File
+# End Group
+# Begin Source File
+
+SOURCE=..\..\decoder\decoder.c
+
+!IF  "$(CFG)" == "libx264 - Win32 Release"
+
+# PROP Intermediate_Dir "obj/dec_release"
+# PROP Exclude_From_Build 1
+
+!ELSEIF  "$(CFG)" == "libx264 - Win32 Debug"
+
+# PROP Intermediate_Dir "obj/dec_debug"
+# PROP Exclude_From_Build 1
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\decoder\macroblock.c
+
+!IF  "$(CFG)" == "libx264 - Win32 Release"
+
+# PROP Intermediate_Dir "obj/dec_release"
+# PROP Exclude_From_Build 1
+
+!ELSEIF  "$(CFG)" == "libx264 - Win32 Debug"
+
+# PROP Intermediate_Dir "obj/dec_debug"
+# PROP Exclude_From_Build 1
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\decoder\set.c
+
+!IF  "$(CFG)" == "libx264 - Win32 Release"
+
+# PROP Intermediate_Dir "obj/dec_release"
+# PROP Exclude_From_Build 1
+
+!ELSEIF  "$(CFG)" == "libx264 - Win32 Debug"
+
+# PROP Intermediate_Dir "obj/dec_debug"
+# PROP Exclude_From_Build 1
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\decoder\vlc.c
+
+!IF  "$(CFG)" == "libx264 - Win32 Release"
+
+# PROP Intermediate_Dir "obj/dec_release"
+# PROP Exclude_From_Build 1
+
+!ELSEIF  "$(CFG)" == "libx264 - Win32 Debug"
+
+# PROP Intermediate_Dir "obj/dec_debug"
+# PROP Exclude_From_Build 1
+
+!ENDIF 
+
+# End Source File
+# End Group
+# Begin Group "extras"
+
+# PROP Default_Filter ".c"
+# Begin Group "extras_h"
+
+# PROP Default_Filter ".h"
+# Begin Source File
+
+SOURCE=..\..\extras\getopt.h
+
+!IF  "$(CFG)" == "libx264 - Win32 Release"
+
+# PROP Intermediate_Dir "obj/util_release"
+
+!ELSEIF  "$(CFG)" == "libx264 - Win32 Debug"
+
+# PROP Intermediate_Dir "obj/util_debug"
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\extras\stdint.h
+
+!IF  "$(CFG)" == "libx264 - Win32 Release"
+
+# PROP Intermediate_Dir "obj/util_release"
+
+!ELSEIF  "$(CFG)" == "libx264 - Win32 Debug"
+
+# PROP Intermediate_Dir "obj/util_debug"
+
+!ENDIF 
+
+# End Source File
+# End Group
+# Begin Source File
+
+SOURCE=..\..\extras\getopt.c
+
+!IF  "$(CFG)" == "libx264 - Win32 Release"
+
+# PROP Intermediate_Dir "obj/util_release"
+
+!ELSEIF  "$(CFG)" == "libx264 - Win32 Debug"
+
+# PROP Intermediate_Dir "obj/util_debug"
+
+!ENDIF 
+
+# End Source File
+# End Group
+# End Target
+# End Project
diff --git a/build/win32/x264.dsp b/build/win32/x264.dsp
new file mode 100644
index 00000000..9fe8398f
--- /dev/null
+++ b/build/win32/x264.dsp
@@ -0,0 +1,94 @@
+# Microsoft Developer Studio Project File - Name="x264" - Package Owner=<4>
+# Microsoft Developer Studio Generated Build File, Format Version 6.00
+# ** DO NOT EDIT **
+
+# TARGTYPE "Win32 (x86) Console Application" 0x0103
+
+CFG=x264 - Win32 Debug
+!MESSAGE This is not a valid makefile. To build this project using NMAKE,
+!MESSAGE use the Export Makefile command and run
+!MESSAGE 
+!MESSAGE NMAKE /f "x264.mak".
+!MESSAGE 
+!MESSAGE You can specify a configuration when running NMAKE
+!MESSAGE by defining the macro CFG on the command line. For example:
+!MESSAGE 
+!MESSAGE NMAKE /f "x264.mak" CFG="x264 - Win32 Debug"
+!MESSAGE 
+!MESSAGE Possible choices for configuration are:
+!MESSAGE 
+!MESSAGE "x264 - Win32 Release" (based on "Win32 (x86) Console Application")
+!MESSAGE "x264 - Win32 Debug" (based on "Win32 (x86) Console Application")
+!MESSAGE 
+
+# Begin Project
+# PROP AllowPerConfigDependencies 0
+# PROP Scc_ProjName ""
+# PROP Scc_LocalPath ""
+CPP=cl.exe
+RSC=rc.exe
+
+!IF  "$(CFG)" == "x264 - Win32 Release"
+
+# PROP BASE Use_MFC 0
+# PROP BASE Use_Debug_Libraries 0
+# PROP BASE Output_Dir "Release"
+# PROP BASE Intermediate_Dir "Release"
+# PROP BASE Target_Dir ""
+# PROP Use_MFC 0
+# PROP Use_Debug_Libraries 0
+# PROP Output_Dir "obj/x264_Release"
+# PROP Intermediate_Dir "obj/x264_Release"
+# PROP Ignore_Export_Lib 0
+# PROP Target_Dir ""
+# ADD BASE CPP /nologo /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /c
+# ADD CPP /nologo /W3 /GX /O2 /I "./core" /I "./encode" /I "./decode" /I "../../extras" /I "../.." /D "NDEBUG" /D "_CONSOLE" /D "WIN32" /D "_MBCS" /D "__X264__" /D "HAVE_MMXEXT" /YX /FD /c
+# ADD BASE RSC /l 0x804 /d "NDEBUG"
+# ADD RSC /l 0x804 /d "NDEBUG"
+BSC32=bscmake.exe
+# ADD BASE BSC32 /nologo
+# ADD BSC32 /nologo
+LINK32=link.exe
+# ADD BASE LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /machine:I386
+# ADD LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /machine:I386 /out:"bin/x264.exe"
+
+!ELSEIF  "$(CFG)" == "x264 - Win32 Debug"
+
+# PROP BASE Use_MFC 0
+# PROP BASE Use_Debug_Libraries 1
+# PROP BASE Output_Dir "Debug"
+# PROP BASE Intermediate_Dir "Debug"
+# PROP BASE Target_Dir ""
+# PROP Use_MFC 0
+# PROP Use_Debug_Libraries 1
+# PROP Output_Dir "obj/x264_Debug"
+# PROP Intermediate_Dir "obj/x264_Debug"
+# PROP Ignore_Export_Lib 0
+# PROP Target_Dir ""
+# ADD BASE CPP /nologo /W3 /Gm /GX /ZI /Od /D "WIN32" /D "_DEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /GZ /c
+# ADD CPP /nologo /W3 /Gm /GX /Zi /Od /I "../../core" /I "../../encode" /I "../../decode" /I "../../extras" /I "../.." /D "_DEBUG" /D "_CONSOLE" /D "WIN32" /D "_MBCS" /D "__X264__" /D "HAVE_MMXEXT" /YX /FD /GZ /c
+# ADD BASE RSC /l 0x804 /d "_DEBUG"
+# ADD RSC /l 0x804 /d "_DEBUG"
+BSC32=bscmake.exe
+# ADD BASE BSC32 /nologo
+# ADD BSC32 /nologo
+LINK32=link.exe
+# ADD BASE LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /debug /machine:I386 /pdbtype:sept
+# ADD LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /debug /machine:I386 /out:"bin/x264.exe" /pdbtype:sept
+
+!ENDIF 
+
+# Begin Target
+
+# Name "x264 - Win32 Release"
+# Name "x264 - Win32 Debug"
+# Begin Source File
+
+SOURCE=..\..\x264.c
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\x264.h
+# End Source File
+# End Target
+# End Project
diff --git a/build/win32/x264.dsw b/build/win32/x264.dsw
new file mode 100644
index 00000000..8ef22bf7
--- /dev/null
+++ b/build/win32/x264.dsw
@@ -0,0 +1,44 @@
+Microsoft Developer Studio Workspace File, Format Version 6.00
+# WARNING: DO NOT EDIT OR DELETE THIS WORKSPACE FILE!
+
+###############################################################################
+
+Project: "libx264"=.\libx264.dsp - Package Owner=<4>
+
+Package=<5>
+{{{
+}}}
+
+Package=<4>
+{{{
+}}}
+
+###############################################################################
+
+Project: "x264"=.\x264.dsp - Package Owner=<4>
+
+Package=<5>
+{{{
+}}}
+
+Package=<4>
+{{{
+    Begin Project Dependency
+    Project_Dep_Name libx264
+    End Project Dependency
+}}}
+
+###############################################################################
+
+Global:
+
+Package=<5>
+{{{
+}}}
+
+Package=<3>
+{{{
+}}}
+
+###############################################################################
+
diff --git a/core/bs.h b/core/bs.h
new file mode 100644
index 00000000..380799f2
--- /dev/null
+++ b/core/bs.h
@@ -0,0 +1,423 @@
+/*****************************************************************************
+ * bs.h :
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: bs.h,v 1.1 2004/06/03 19:27:06 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#ifdef _BS_H
+#warning FIXME Multiple inclusion of bs.h
+#else
+#define _BS_H
+
+typedef struct bs_s
+{
+    uint8_t *p_start;  /* first byte of the buffer given to bs_init() */
+    uint8_t *p;        /* byte currently being read or written */
+    uint8_t *p_end;    /* one past the last byte: p_start + i_data */
+
+    int     i_left;    /* number of bits of *p not yet consumed (8 = untouched byte) */
+} bs_t;
+
+static inline void bs_init( bs_t *s, void *p_data, int i_data )
+{
+    s->p_start = p_data;
+    s->p       = p_data;
+    s->p_end   = s->p + i_data;
+    s->i_left  = 8;
+}
+static inline int bs_pos( bs_t *s )
+{
+    return( 8 * ( s->p - s->p_start ) + 8 - s->i_left );
+}
+static inline int bs_eof( bs_t *s )
+{
+    return( s->p >= s->p_end ? 1: 0 );
+}
+static inline uint32_t bs_read( bs_t *s, int i_count )
+{
+     static uint32_t i_mask[33] ={0x00,
+                                  0x01,      0x03,      0x07,      0x0f,
+                                  0x1f,      0x3f,      0x7f,      0xff,
+                                  0x1ff,     0x3ff,     0x7ff,     0xfff,
+                                  0x1fff,    0x3fff,    0x7fff,    0xffff,
+                                  0x1ffff,   0x3ffff,   0x7ffff,   0xfffff,
+                                  0x1fffff,  0x3fffff,  0x7fffff,  0xffffff,
+                                  0x1ffffff, 0x3ffffff, 0x7ffffff, 0xfffffff,
+                                  0x1fffffff,0x3fffffff,0x7fffffff,0xffffffff};
+    int      i_shr;
+    uint32_t i_result = 0;
+
+    while( i_count > 0 )
+    {
+        if( s->p >= s->p_end )
+        {
+            break;
+        }
+
+        if( ( i_shr = s->i_left - i_count ) >= 0 )
+        {
+            /* more in the buffer than requested */
+            i_result |= ( *s->p >> i_shr )&i_mask[i_count];
+            s->i_left -= i_count;
+            if( s->i_left == 0 )
+            {
+                s->p++;
+                s->i_left = 8;
+            }
+            return( i_result );
+        }
+        else
+        {
+            /* less in the buffer than requested */
+           i_result |= (*s->p&i_mask[s->i_left]) << -i_shr;
+           i_count  -= s->i_left;
+           s->p++;
+           s->i_left = 8;
+        }
+    }
+
+    return( i_result );
+}
+
+#if 0
+/* Only > i386 */
+static uint32_t bswap32( uint32_t x )
+{
+    asm( "bswap   %0": "=r" (x):"0" (x));
+    return x;
+}
+/* work only for i_count <= 32 - 7 */
+static inline uint32_t bs_read( bs_t *s, int i_count )
+{
+    if( s->p < s->p_end && i_count > 0 )
+    {
+#if 0
+        uint32_t i_cache = ((s->p[0] << 24)+(s->p[1] << 16)+(s->p[2] << 8)+s->p[3]) << (8-s->i_left);
+#else
+        uint32_t i_cache = bswap32( *((uint32_t*)s->p) ) << (8-s->i_left);
+#endif
+        uint32_t i_ret = i_cache >> ( 32 - i_count);
+
+        s->i_left -= i_count;
+#if 0
+        if( s->i_left <= 0 )
+        {
+            int i_skip = (8-s->i_left) >> 3;
+
+            s->p += i_skip;
+
+            s->i_left += i_skip << 3;
+        }
+#else
+        while( s->i_left <= 0 )
+        {
+            s->p++;
+            s->i_left += 8;
+        }
+#endif
+        return i_ret;
+    }
+    return 0;
+}
+
+#endif
+static inline uint32_t bs_read1( bs_t *s )
+{
+
+    if( s->p < s->p_end )
+    {
+        unsigned int i_result;
+
+        s->i_left--;
+        i_result = ( *s->p >> s->i_left )&0x01;
+        if( s->i_left == 0 )
+        {
+            s->p++;
+            s->i_left = 8;
+        }
+        return i_result;
+    }
+
+    return 0;
+}
+static inline uint32_t bs_show( bs_t *s, int i_count )
+{
+    /* Return the next i_count bits (i_count must not exceed 32) without
+     * consuming them. The peek runs bs_read() on a scratch copy, so *s is
+     * untouched and no byte at or after p_end is dereferenced (the former
+     * 4-byte cache load could over-read); bits past the end read as 0. */
+    bs_t s_tmp = *s;
+
+    if( i_count <= 0 )
+    {
+        return 0;
+    }
+    return bs_read( &s_tmp, i_count );
+}
+
+/* Skip i_count bits; s->p is not checked against s->p_end here. TODO optimize */
+static inline void bs_skip( bs_t *s, int i_count )
+{
+    s->i_left -= i_count;    /* may drop to zero or below ... */
+
+    while( s->i_left <= 0 )  /* ... so step over whole bytes until i_left is positive again */
+    {
+        s->p++;
+        s->i_left += 8;
+    }
+}
+
+
+static inline int bs_read_ue( bs_t *s )
+{
+    int i = 0;    /* number of leading zero bits of the Exp-Golomb prefix */
+
+    /* Cap the prefix at 31 zeros so the unsigned shift below stays within 32 bits. */
+    while( bs_read1( s ) == 0 && s->p < s->p_end && i < 31 )
+    {
+        i++;
+    }
+    return( (int)( ( (uint32_t)1 << i ) - 1 + bs_read( s, i ) ) );    /* 2^i - 1 + i-bit suffix */
+}
+static inline int bs_read_se( bs_t *s )
+{
+    int val = bs_read_ue( s );
+
+    return val&0x01 ? (val+1)/2 : -(val/2);
+}
+
+static inline int bs_read_te( bs_t *s, int x )
+{
+    if( x == 1 )
+    {
+        return 1 - bs_read1( s );
+    }
+    else if( x > 1 )
+    {
+        return bs_read_ue( s );
+    }
+    return 0;
+}
+
+/* Write the i_count low bits of i_bits, MSB first; stops silently at p_end. TODO optimize (write x bits at once) */
+static inline void bs_write( bs_t *s, int i_count, uint32_t i_bits )
+{
+    while( i_count > 0 )
+    {
+        if( s->p >= s->p_end )
+        {
+            break;
+        }
+
+        i_count--;
+        /* Positions >= 32 are padding zeros (bs_write_ue asks for up to 63 bits); shifting a uint32_t that far is undefined. */
+        if( i_count < 32 && ( ( i_bits >> i_count )&0x01 ) )
+        {
+            *s->p |= 1 << ( s->i_left - 1 );
+        }
+        else
+        {
+            *s->p &= ~( 1 << ( s->i_left - 1 ) );
+        }
+        s->i_left--;
+        if( s->i_left == 0 )
+        {
+            s->p++;
+            s->i_left = 8;
+        }
+    }
+}
+
+static inline void bs_write1( bs_t *s, uint32_t i_bits )
+{
+    if( s->p < s->p_end )
+    {
+        s->i_left--;
+
+        if( i_bits&0x01 )
+        {
+            *s->p |= 1 << s->i_left;
+        }
+        else
+        {
+            *s->p &= ~( 1 << s->i_left );
+        }
+        if( s->i_left == 0 )
+        {
+            s->p++;
+            s->i_left = 8;
+        }
+    }
+}
+
+static inline void bs_align( bs_t *s )
+{
+    if( s->i_left != 8 )
+    {
+        s->i_left = 8;
+        s->p++;
+    }
+}
+static inline void bs_align_0( bs_t *s )
+{
+    if( s->i_left != 8 )
+    {
+        bs_write( s, s->i_left, 0 );
+    }
+}
+static inline void bs_align_1( bs_t *s )    /* pad with 1 bits up to the next byte boundary */
+{
+    if( s->i_left != 8 )    /* i_left == 8 means the position is already byte aligned */
+    {
+        bs_write( s, s->i_left, ( 1 << s->i_left ) - 1 );    /* i_left one bits, not the value 1 */
+    }
+}
+
+
+
+/* golomb functions */
+
+static inline void bs_write_ue( bs_t *s, unsigned int val )
+{
+    int i_size = 0;
+    static const int i_size0_255[256] =
+    {
+        1,1,2,2,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,
+        6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
+        7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
+        7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
+        8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
+        8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
+        8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
+        8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8
+    };
+
+    if( val == 0 )
+    {
+        bs_write( s, 1, 1 );
+    }
+    else
+    {
+        unsigned int tmp = ++val;
+
+        if( tmp >= 0x00010000 )
+        {
+            i_size += 16;
+            tmp >>= 16;
+        }
+        if( tmp >= 0x100 )
+        {
+            i_size += 8;
+            tmp >>= 8;
+        }
+        i_size += i_size0_255[tmp];
+
+        bs_write( s, 2 * i_size - 1, val );
+    }
+}
+
+static inline void bs_write_se( bs_t *s, int val )
+{
+    bs_write_ue( s, val <= 0 ? -val * 2 : val * 2 - 1);
+}
+
+static inline void bs_write_te( bs_t *s, int x, int val )
+{
+    if( x == 1 )
+    {
+        bs_write( s, 1, ~val );
+    }
+    else if( x > 1 )
+    {
+        bs_write_ue( s, val );
+    }
+}
+
+static inline void bs_rbsp_trailing( bs_t *s )
+{
+    bs_write( s, 1, 1 );
+    if( s->i_left != 8 )
+    {
+        bs_write( s, s->i_left, 0x00 );
+    }
+}
+
+static inline int bs_size_ue( unsigned int val )
+{
+    static const int i_size0_254[255] =
+    {
+        1, 3, 3, 5, 5, 5, 5, 7, 7, 7, 7, 7, 7, 7, 7,
+        9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+        11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,
+        11,11,11,11,11,11,11,11,11,13,13,13,13,13,13,13,13,13,13,13,13,13,13,
+        13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,
+        13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,
+        13,13,13,13,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
+        15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
+        15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
+        15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
+        15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
+        15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15
+    };
+    /* Returns the size in bits of the code ue( val ); i_size0_254[v] is that size for v < 255. */
+    if( val < 255 )
+    {
+        return i_size0_254[val];
+    }
+    else
+    {
+        int i_size = 0;
+
+        val++;    /* the code word length depends on the bit length of val + 1 */
+        if( val == 0 ) { return 65; }    /* val was UINT_MAX: val + 1 needs 33 bits */
+        if( val >= 0x10000 )
+        {
+            i_size += 32;    /* every bit shifted out costs 2 bits of code word */
+            val >>= 16;
+        }
+        if( val >= 0x100 )
+        {
+            i_size += 16;
+            val >>= 8;
+        }
+        return i_size0_254[val - 1] + i_size;    /* val is now 1..255; subtract 1 only at lookup time */
+    }
+}
+
+static inline int bs_size_se( int val )
+{
+    return bs_size_ue( val <= 0 ? -val * 2 : val * 2 - 1);
+}
+
+static inline int bs_size_te( int x, int val )
+{
+    if( x == 1 )
+    {
+        return 1;
+    }
+    else if( x > 1 )
+    {
+        return bs_size_ue( val );
+    }
+    return 0;
+}
+
+
+
+#endif
diff --git a/core/cabac.c b/core/cabac.c
new file mode 100644
index 00000000..51a06e2c
--- /dev/null
+++ b/core/cabac.c
@@ -0,0 +1,1044 @@
+/*****************************************************************************
+ * cabac.c: h264 encoder library
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: cabac.c,v 1.1 2004/06/03 19:27:06 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+#include <stdio.h>
+
+#include "common.h"
+
+//#define TRACE 1
+
+/* Debugging purposes ONLY: binCount is compiled in only when TRACE is defined */
+#ifdef TRACE
+static int binCount = 0;
+#endif
+
+
+static const int x264_cabac_context_init_I[399][2] =
+{   /* 399 entries of two ints; section comments give the index ranges */
+    /* 0 - 10 */
+    { 20, -15 }, {  2, 54 },  {  3,  74 }, { 20, -15 },
+    {  2,  54 }, {  3, 74 },  { -28,127 }, { -23, 104 },
+    { -6,  53 }, { -1, 54 },  {  7,  51 },
+
+    /* 11 - 23 unused for I */
+    { 0, 0 },    { 0, 0 },    { 0, 0 },      { 0, 0 },
+    { 0, 0 },    { 0, 0 },    { 0, 0 },      { 0, 0 },
+    { 0, 0 },    { 0, 0 },    { 0, 0 },      { 0, 0 },
+    { 0, 0 },
+
+    /* 24 - 39 (all zero in this table) */
+    { 0, 0 },    { 0, 0 },    { 0, 0 },      { 0, 0 },
+    { 0, 0 },    { 0, 0 },    { 0, 0 },      { 0, 0 },
+    { 0, 0 },    { 0, 0 },    { 0, 0 },      { 0, 0 },
+    { 0, 0 },    { 0, 0 },    { 0, 0 },      { 0, 0 },
+
+    /* 40 - 53 (all zero in this table) */
+    { 0, 0 },    { 0, 0 },    { 0, 0 },      { 0, 0 },
+    { 0, 0 },    { 0, 0 },    { 0, 0 },      { 0, 0 },
+    { 0, 0 },    { 0, 0 },    { 0, 0 },      { 0, 0 },
+    { 0, 0 },    { 0, 0 },
+
+    /* 54 - 59 (all zero in this table) */
+    { 0, 0 },    { 0, 0 },    { 0, 0 },      { 0, 0 },
+    { 0, 0 },    { 0, 0 },
+
+    /* 60 - 69 */
+    { 0, 41 },   { 0, 63 },   { 0, 63 },     { 0, 63 },
+    { -9, 83 },  { 4, 86 },   { 0, 97 },     { -7, 72 },
+    { 13, 41 },  { 3, 62 },
+
+    /* 70 -> 87 */
+    { 0, 11 },   { 1, 55 },   { 0, 69 },     { -17, 127 },
+    { -13, 102 },{ 0, 82 },   { -7, 74 },    { -21, 107 },
+    { -27, 127 },{ -31, 127 },{ -24, 127 },  { -18, 95 },
+    { -27, 127 },{ -21, 114 },{ -30, 127 },  { -17, 123 },
+    { -12, 115 },{ -16, 122 },
+
+    /* 88 -> 104 */
+    { -11, 115 },{ -12, 63 }, { -2, 68 },    { -15, 84 },
+    { -13, 104 },{ -3, 70 },  { -8, 93 },    { -10, 90 },
+    { -30, 127 },{ -1, 74 },  { -6, 97 },    { -7, 91 },
+    { -20, 127 },{ -4, 56 },  { -5, 82 },    { -7, 76 },
+    { -22, 125 },
+
+    /* 105 -> 135 */
+    { -7, 93 },  { -11, 87 }, { -3, 77 },    { -5, 71 },
+    { -4, 63 },  { -4, 68 },  { -12, 84 },   { -7, 62 },
+    { -7, 65 },  { 8, 61 },   { 5, 56 },     { -2, 66 },
+    { 1, 64 },   { 0, 61 },   { -2, 78 },    { 1, 50 },
+    { 7, 52 },   { 10, 35 },  { 0, 44 },     { 11, 38 },
+    { 1, 45 },   { 0, 46 },   { 5, 44 },     { 31, 17 },
+    { 1, 51 },   { 7, 50 },   { 28, 19 },    { 16, 33 },
+    { 14, 62 },  { -13, 108 },{ -15, 100 },
+
+    /* 136 -> 165 */
+    { -13, 101 },{ -13, 91 }, { -12, 94 },   { -10, 88 },
+    { -16, 84 }, { -10, 86 }, { -7, 83 },    { -13, 87 },
+    { -19, 94 }, { 1, 70 },   { 0, 72 },     { -5, 74 },
+    { 18, 59 },  { -8, 102 }, { -15, 100 },  { 0, 95 },
+    { -4, 75 },  { 2, 72 },   { -11, 75 },   { -3, 71 },
+    { 15, 46 },  { -13, 69 }, { 0, 62 },     { 0, 65 },
+    { 21, 37 },  { -15, 72 }, { 9, 57 },     { 16, 54 },
+    { 0, 62 },   { 12, 72 },
+
+    /* 166 -> 196 */
+    { 24, 0 },   { 15, 9 },   { 8, 25 },     { 13, 18 },
+    { 15, 9 },   { 13, 19 },  { 10, 37 },    { 12, 18 },
+    { 6, 29 },   { 20, 33 },  { 15, 30 },    { 4, 45 },
+    { 1, 58 },   { 0, 62 },   { 7, 61 },     { 12, 38 },
+    { 11, 45 },  { 15, 39 },  { 11, 42 },    { 13, 44 },
+    { 16, 45 },  { 12, 41 },  { 10, 49 },    { 30, 34 },
+    { 18, 42 },  { 10, 55 },  { 17, 51 },    { 17, 46 },
+    { 0, 89 },   { 26, -19 }, { 22, -17 },
+
+    /* 197 -> 226 */
+    { 26, -17 }, { 30, -25 }, { 28, -20 },   { 33, -23 },
+    { 37, -27 }, { 33, -23 }, { 40, -28 },   { 38, -17 },
+    { 33, -11 }, { 40, -15 }, { 41, -6 },    { 38, 1 },
+    { 41, 17 },  { 30, -6 },  { 27, 3 },     { 26, 22 },
+    { 37, -16 }, { 35, -4 },  { 38, -8 },    { 38, -3 },
+    { 37, 3 },   { 38, 5 },   { 42, 0 },     { 35, 16 },
+    { 39, 22 },  { 14, 48 },  { 27, 37 },    { 21, 60 },
+    { 12, 68 },  { 2, 97 },
+
+    /* 227 -> 251 */
+    { -3, 71 },  { -6, 42 },  { -5, 50 },    { -3, 54 },
+    { -2, 62 },  { 0, 58 },   { 1, 63 },     { -2, 72 },
+    { -1, 74 },  { -9, 91 },  { -5, 67 },    { -5, 27 },
+    { -3, 39 },  { -2, 44 },  { 0, 46 },     { -16, 64 },
+    { -8, 68 },  { -10, 78 }, { -6, 77 },    { -10, 86 },
+    { -12, 92 }, { -15, 55 }, { -10, 60 },   { -6, 62 },
+    { -4, 65 },
+
+    /* 252 -> 275 */
+    { -12, 73 }, { -8, 76 },  { -7, 80 },    { -9, 88 },
+    { -17, 110 },{ -11, 97 }, { -20, 84 },   { -11, 79 },
+    { -6, 73 },  { -4, 74 },  { -13, 86 },   { -13, 96 },
+    { -11, 97 }, { -19, 117 },{ -8, 78 },    { -5, 33 },
+    { -4, 48 },  { -2, 53 },  { -3, 62 },    { -13, 71 },
+    { -10, 79 }, { -12, 86 }, { -13, 90 },   { -14, 97 },
+
+    /* 276 a bit special (not used, x264_cabac_encode_bypass is used instead) */
+    { 0, 0 },
+
+    /* 277 -> 307 */
+    { -6, 93 },  { -6, 84 },  { -8, 79 },    { 0, 66 },
+    { -1, 71 },  { 0, 62 },   { -2, 60 },    { -2, 59 },
+    { -5, 75 },  { -3, 62 },  { -4, 58 },    { -9, 66 },
+    { -1, 79 },  { 0, 71 },   { 3, 68 },     { 10, 44 },
+    { -7, 62 },  { 15, 36 },  { 14, 40 },    { 16, 27 },
+    { 12, 29 },  { 1, 44 },   { 20, 36 },    { 18, 32 },
+    { 5, 42 },   { 1, 48 },   { 10, 62 },    { 17, 46 },
+    { 9, 64 },   { -12, 104 },{ -11, 97 },
+
+    /* 308 -> 337 */
+    { -16, 96 }, { -7, 88 },  { -8, 85 },    { -7, 85 },
+    { -9, 85 },  { -13, 88 }, { 4, 66 },     { -3, 77 },
+    { -3, 76 },  { -6, 76 },  { 10, 58 },    { -1, 76 },
+    { -1, 83 },  { -7, 99 },  { -14, 95 },   { 2, 95 },
+    { 0, 76 },   { -5, 74 },  { 0, 70 },     { -11, 75 },
+    { 1, 68 },   { 0, 65 },   { -14, 73 },   { 3, 62 },
+    { 4, 62 },   { -1, 68 },  { -13, 75 },   { 11, 55 },
+    { 5, 64 },   { 12, 70 },
+
+    /* 338 -> 368 */
+    { 15, 6 },   { 6, 19 },   { 7, 16 },     { 12, 14 },
+    { 18, 13 },  { 13, 11 },  { 13, 15 },    { 15, 16 },
+    { 12, 23 },  { 13, 23 },  { 15, 20 },    { 14, 26 },
+    { 14, 44 },  { 17, 40 },  { 17, 47 },    { 24, 17 },
+    { 21, 21 },  { 25, 22 },  { 31, 27 },    { 22, 29 },
+    { 19, 35 },  { 14, 50 },  { 10, 57 },    { 7, 63 },
+    { -2, 77 },  { -4, 82 },  { -3, 94 },    { 9, 69 },
+    { -12, 109 },{ 36, -35 }, { 36, -34 },
+
+    /* 369 -> 398 */
+    { 32, -26 }, { 37, -30 }, { 44, -32 },   { 34, -18 },
+    { 34, -15 }, { 40, -15 }, { 33, -7 },    { 35, -5 },
+    { 33, 0 },   { 38, 2 },   { 33, 13 },    { 23, 35 },
+    { 13, 58 },  { 29, -3 },  { 26, 0 },     { 22, 30 },
+    { 31, -7 },  { 35, -15 }, { 34, -3 },    { 34, 3 },
+    { 36, -1 },  { 34, 5 },   { 32, 11 },    { 35, 5 },
+    { 34, 12 },  { 39, 11 },  { 30, 29 },    { 34, 26 },
+    { 29, 39 },  { 19, 66 }
+};
+
+static const int x264_cabac_context_init_PB[3][399][2] =
+{   /* one 399-entry table per i_cabac_init_idc; comments give index ranges */
+    /* i_cabac_init_idc == 0 */
+    {
+        /* 0 - 10 */
+        {  20, -15 }, {   2,  54 }, {   3,  74 }, {  20, -15 },
+        {   2,  54 }, {   3,  74 }, { -28, 127 }, { -23, 104 },
+        {  -6,  53 }, {  -1,  54 }, {   7,  51 },
+
+        /* 11 - 23 */
+        {  23,  33 }, {  23,   2 }, {  21,   0 }, {   1,   9 },
+        {   0,  49 }, { -37, 118 }, {   5,  57 }, { -13,  78 },
+        { -11,  65 }, {   1,  62 }, {  12,  49 }, {  -4,  73 },
+        {  17,  50 },
+
+        /* 24 - 39 */
+        {  18,  64 }, {   9,  43 }, {  29,   0 }, {  26,  67 },
+        {  16,  90 }, {   9, 104 }, { -46, 127 }, { -20, 104 },
+        {   1,  67 }, { -13,  78 }, { -11,  65 }, {   1,  62 },
+        {  -6,  86 }, { -17,  95 }, {  -6,  61 }, {   9,  45 },
+
+        /* 40 - 53 */
+        {  -3,  69 }, {  -6,  81 }, { -11,  96 }, {   6,  55 },
+        {   7,  67 }, {  -5,  86 }, {   2,  88 }, {   0,  58 },
+        {  -3,  76 }, { -10,  94 }, {   5,  54 }, {   4,  69 },
+        {  -3,  81 }, {   0,  88 },
+
+        /* 54 - 59 */
+        {  -7,  67 }, {  -5,  74 }, {  -4,  74 }, {  -5,  80 },
+        {  -7,  72 }, {   1,  58 },
+
+        /* 60 - 69 */
+        {   0,  41 }, {   0,  63 }, {   0,  63 }, { 0, 63 },
+        {  -9,  83 }, {   4,  86 }, {   0,  97 }, { -7, 72 },
+        {  13,  41 }, {   3,  62 },
+
+        /* 70 - 104 */
+        {   0,  45 }, {  -4,  78 }, {  -3,  96 }, { -27,  126 },
+        { -28,  98 }, { -25, 101 }, { -23,  67 }, { -28,  82 },
+        { -20,  94 }, { -16,  83 }, { -22, 110 }, { -21,  91 },
+        { -18, 102 }, { -13,  93 }, { -29, 127 }, {  -7,  92 },
+        {  -5,  89 }, {  -7,  96 }, { -13, 108 }, {  -3,  46 },
+        {  -1,  65 }, {  -1,  57 }, {  -9,  93 }, {  -3,  74 },
+        {  -9,  92 }, {  -8,  87 }, { -23, 126 }, {   5,  54 },
+        {   6,  60 }, {   6,  59 }, {   6,  69 }, {  -1,  48 },
+        {   0,  68 }, {  -4,  69 }, {  -8,  88 },
+
+        /* 105 -> 165 */
+        {  -2,  85 }, {  -6,  78 }, {  -1,  75 }, {  -7,  77 },
+        {   2,  54 }, {   5,  50 }, {  -3,  68 }, {   1,  50 },
+        {   6,  42 }, {  -4,  81 }, {   1,  63 }, {  -4,  70 },
+        {   0,  67 }, {   2,  57 }, {  -2,  76 }, {  11,  35 },
+        {   4,  64 }, {   1,  61 }, {  11,  35 }, {  18,  25 },
+        {  12,  24 }, {  13,  29 }, {  13,  36 }, { -10,  93 },
+        {  -7,  73 }, {  -2,  73 }, {  13,  46 }, {   9,  49 },
+        {  -7, 100 }, {   9,  53 }, {   2,  53 }, {   5,  53 },
+        {  -2,  61 }, {   0,  56 }, {   0,  56 }, { -13,  63 },
+        {  -5,  60 }, {  -1,  62 }, {   4,  57 }, {  -6,  69 },
+        {   4,  57 }, {  14,  39 }, {   4,  51 }, {  13,  68 },
+        {   3,  64 }, {   1,  61 }, {   9,  63 }, {   7,  50 },
+        {  16,  39 }, {   5,  44 }, {   4,  52 }, {  11,  48 },
+        {  -5,  60 }, {  -1,  59 }, {   0,  59 }, {  22,  33 },
+        {   5,  44 }, {  14,  43 }, {  -1,  78 }, {   0,  60 },
+        {   9,  69 },
+
+        /* 166 - 226 */
+        {  11,  28 }, {   2,  40 }, {   3,  44 }, {   0,  49 },
+        {   0,  46 }, {   2,  44 }, {   2,  51 }, {   0,  47 },
+        {   4,  39 }, {   2,  62 }, {   6,  46 }, {   0,  54 },
+        {   3,  54 }, {   2,  58 }, {   4,  63 }, {   6,  51 },
+        {   6,  57 }, {   7,  53 }, {   6,  52 }, {   6,  55 },
+        {  11,  45 }, {  14,  36 }, {   8,  53 }, {  -1,  82 },
+        {   7,  55 }, {  -3,  78 }, {  15,  46 }, {  22,  31 },
+        {  -1,  84 }, {  25,   7 }, {  30,  -7 }, {  28,   3 },
+        {  28,   4 }, {  32,   0 }, {  34,  -1 }, {  30,   6 },
+        {  30,   6 }, {  32,   9 }, {  31,  19 }, {  26,  27 },
+        {  26,  30 }, {  37,  20 }, {  28,  34 }, {  17,  70 },
+        {   1,  67 }, {   5,  59 }, {   9,  67 }, {  16,  30 },
+        {  18,  32 }, {  18,  35 }, {  22,  29 }, {  24,  31 },
+        {  23,  38 }, {  18,  43 }, {  20,  41 }, {  11,  63 },
+        {   9,  59 }, {   9,  64 }, {  -1,  94 }, {  -2,  89 },
+        {  -9, 108 },
+
+        /* 227 - 275 */
+        {  -6,  76 }, {  -2,  44 }, {   0,  45 }, {   0,  52 },
+        {  -3,  64 }, {  -2,  59 }, {  -4,  70 }, {  -4,  75 },
+        {  -8,  82 }, { -17, 102 }, {  -9,  77 }, {   3,  24 },
+        {   0,  42 }, {   0,  48 }, {   0,  55 }, {  -6,  59 },
+        {  -7,  71 }, { -12,  83 }, { -11,  87 }, { -30, 119 },
+        {   1,  58 }, {  -3,  29 }, {  -1,  36 }, {   1,  38 },
+        {   2,  43 }, {  -6,  55 }, {   0,  58 }, {   0,  64 },
+        {  -3,  74 }, { -10,  90 }, {   0,  70 }, {  -4,  29 },
+        {   5,  31 }, {   7,  42 }, {   1,  59 }, {  -2,  58 },
+        {  -3,  72 }, {  -3,  81 }, { -11,  97 }, {   0,  58 },
+        {   8,   5 }, {  10,  14 }, {  14,  18 }, {  13,  27 },
+        {   2,  40 }, {   0,  58 }, {  -3,  70 }, {  -6,  79 },
+        {  -8,  85 },
+
+        /* 276 a bit special (not used, x264_cabac_encode_bypass is used instead) */
+        { 0, 0 },
+
+        /* 277 - 337 */
+        { -13, 106 }, { -16, 106 }, { -10,  87 }, { -21, 114 },
+        { -18, 110 }, { -14,  98 }, { -22, 110 }, { -21, 106 },
+        { -18, 103 }, { -21, 107 }, { -23, 108 }, { -26, 112 },
+        { -10,  96 }, { -12,  95 }, {  -5,  91 }, {  -9,  93 },
+        { -22,  94 }, {  -5,  86 }, {   9,  67 }, {  -4,  80 },
+        { -10,  85 }, {  -1,  70 }, {   7,  60 }, {   9,  58 },
+        {   5,  61 }, {  12,  50 }, {  15,  50 }, {  18,  49 },
+        {  17,  54 }, {  10,  41 }, {   7,  46 }, {  -1,  51 },
+        {   7,  49 }, {   8,  52 }, {   9,  41 }, {   6,  47 },
+        {   2,  55 }, {  13,  41 }, {  10,  44 }, {   6,  50 },
+        {   5,  53 }, {  13,  49 }, {   4,  63 }, {   6,  64 },
+        {  -2,  69 }, {  -2,  59 }, {   6,  70 }, {  10,  44 },
+        {   9,  31 }, {  12,  43 }, {   3,  53 }, {  14,  34 },
+        {  10,  38 }, {  -3,  52 }, {  13,  40 }, {  17,  32 },
+        {   7,  44 }, {   7,  38 }, {  13,  50 }, {  10,  57 },
+        {  26,  43 },
+
+        /* 338 - 398 */
+        {  14,  11 }, {  11,  14 }, {   9,  11 }, {  18,  11 },
+        {  21,   9 }, {  23,  -2 }, {  32, -15 }, {  32, -15 },
+        {  34, -21 }, {  39, -23 }, {  42, -33 }, {  41, -31 },
+        {  46, -28 }, {  38, -12 }, {  21,  29 }, {  45, -24 },
+        {  53, -45 }, {  48, -26 }, {  65, -43 }, {  43, -19 },
+        {  39, -10 }, {  30,   9 }, {  18,  26 }, {  20,  27 },
+        {   0,  57 }, { -14,  82 }, {  -5,  75 }, { -19,  97 },
+        { -35, 125 }, {  27,   0 }, {  28,   0 }, {  31,  -4 },
+        {  27,   6 }, {  34,   8 }, {  30,  10 }, {  24,  22 },
+        {  33,  19 }, {  22,  32 }, {  26,  31 }, {  21,  41 },
+        {  26,  44 }, {  23,  47 }, {  16,  65 }, {  14,  71 },
+        {   8,  60 }, {   6,  63 }, {  17,  65 }, {  21,  24 },
+        {  23,  20 }, {  26,  23 }, {  27,  32 }, {  28,  23 },
+        {  28,  24 }, {  23,  40 }, {  24,  32 }, {  28,  29 },
+        {  23,  42 }, {  19,  57 }, {  22,  53 }, {  22,  61 },
+        {  11,  86 },
+
+
+    },
+
+    /* i_cabac_init_idc == 1 */
+    {
+        /* 0 - 10 */
+        {  20, -15 }, {   2,  54 }, {   3,  74 }, {  20, -15 },
+        {   2,  54 }, {   3,  74 }, { -28, 127 }, { -23, 104 },
+        {  -6,  53 }, {  -1,  54 }, {   7,  51 },
+
+        /* 11 - 23 */
+        {  22,  25 }, {  34,   0 }, {  16,   0 }, {  -2,   9 },
+        {   4,  41 }, { -29, 118 }, {   2,  65 }, {  -6,  71 },
+        { -13,  79 }, {   5,  52 }, {   9,  50 }, {  -3,  70 },
+        {  10,  54 },
+
+        /* 24 - 39 */
+        {  26,  34 }, {  19,  22 }, {  40,   0 }, {  57,   2 },
+        {  41,  36 }, {  26,  69 }, { -45, 127 }, { -15, 101 },
+        {  -4,  76 }, {  -6,  71 }, { -13,  79 }, {   5,  52 },
+        {   6,  69 }, { -13,  90 }, {   0,  52 }, {   8,  43 },
+
+        /* 40 - 53 */
+        {  -2,  69 },{  -5,  82 },{ -10,  96 },{   2,  59 },
+        {   2,  75 },{  -3,  87 },{  -3,  100 },{   1,  56 },
+        {  -3,  74 },{  -6,  85 },{   0,  59 },{  -3,  81 },
+        {  -7,  86 },{  -5,  95 },
+
+        /* 54 - 59 */
+        {  -1,  66 },{  -1,  77 },{   1,  70 },{  -2,  86 },
+        {  -5,  72 },{   0,  61 },
+
+        /* 60 - 69 */
+        { 0, 41 },   { 0, 63 },   { 0, 63 },     { 0, 63 },
+        { -9, 83 },  { 4, 86 },   { 0, 97 },     { -7, 72 },
+        { 13, 41 },  { 3, 62 },
+
+        /* 70 - 104 */
+        {  13,  15 }, {   7,  51 }, {   2,  80 }, { -39, 127 },
+        { -18,  91 }, { -17,  96 }, { -26,  81 }, { -35,  98 },
+        { -24, 102 }, { -23,  97 }, { -27, 119 }, { -24,  99 },
+        { -21, 110 }, { -18, 102 }, { -36, 127 }, {   0,  80 },
+        {  -5,  89 }, {  -7,  94 }, {  -4,  92 }, {   0,  39 },
+        {   0,  65 }, { -15,  84 }, { -35, 127 }, {  -2,  73 },
+        { -12, 104 }, {  -9,  91 }, { -31, 127 }, {   3,  55 },
+        {   7,  56 }, {   7,  55 }, {   8,  61 }, {  -3,  53 },
+        {   0,  68 }, {  -7,  74 }, {  -9,  88 },
+
+        /* 105 -> 165 */
+        { -13, 103 }, { -13,  91 }, {  -9,  89 }, { -14,  92 },
+        {  -8,  76 }, { -12,  87 }, { -23, 110 }, { -24, 105 },
+        { -10,  78 }, { -20, 112 }, { -17,  99 }, { -78, 127 },
+        { -70, 127 }, { -50, 127 }, { -46, 127 }, {  -4,  66 },
+        {  -5,  78 }, {  -4,  71 }, {  -8,  72 }, {   2,  59 },
+        {  -1,  55 }, {  -7,  70 }, {  -6,  75 }, {  -8,  89 },
+        { -34, 119 }, {  -3,  75 }, {  32,  20 }, {  30,  22 },
+        { -44, 127 }, {   0,  54 }, {  -5,  61 }, {   0,  58 },
+        {  -1,  60 }, {  -3,  61 }, {  -8,  67 }, { -25,  84 },
+        { -14,  74 }, {  -5,  65 }, {   5,  52 }, {   2,  57 },
+        {   0,  61 }, {  -9,  69 }, { -11,  70 }, {  18,  55 },
+        {  -4,  71 }, {   0,  58 }, {   7,  61 }, {   9,  41 },
+        {  18,  25 }, {   9,  32 }, {   5,  43 }, {   9,  47 },
+        {   0,  44 }, {   0,  51 }, {   2,  46 }, {  19,  38 },
+        {  -4,  66 }, {  15,  38 }, {  12,  42 }, {   9,  34 },
+        {   0,  89 },
+
+        /* 166 - 226 */
+        {   4,  45 }, {  10,  28 }, {  10,  31 }, {  33, -11 },
+        {  52, -43 }, {  18,  15 }, {  28,   0 }, {  35, -22 },
+        {  38, -25 }, {  34,   0 }, {  39, -18 }, {  32, -12 },
+        { 102, -94 }, {   0,   0 }, {  56, -15 }, {  33,  -4 },
+        {  29,  10 }, {  37,  -5 }, {  51, -29 }, {  39,  -9 },
+        {  52, -34 }, {  69, -58 }, {  67, -63 }, {  44,  -5 },
+        {  32,   7 }, {  55, -29 }, {  32,   1 }, {   0,   0 },
+        {  27,  36 }, {  33, -25 }, {  34, -30 }, {  36, -28 },
+        {  38, -28 }, {  38, -27 }, {  34, -18 }, {  35, -16 },
+        {  34, -14 }, {  32,  -8 }, {  37,  -6 }, {  35,   0 },
+        {  30,  10 }, {  28,  18 }, {  26,  25 }, {  29,  41 },
+        {   0,  75 }, {   2,  72 }, {   8,  77 }, {  14,  35 },
+        {  18,  31 }, {  17,  35 }, {  21,  30 }, {  17,  45 },
+        {  20,  42 }, {  18,  45 }, {  27,  26 }, {  16,  54 },
+        {   7,  66 }, {  16,  56 }, {  11,  73 }, {  10,  67 },
+        { -10, 116 },
+
+        /* 227 - 275 */
+        { -23, 112 }, { -15,  71 }, {  -7,  61 }, {   0,  53 },
+        {  -5,  66 }, { -11,  77 }, {  -9,  80 }, {  -9,  84 },
+        { -10,  87 }, { -34, 127 }, { -21, 101 }, {  -3,  39 },
+        {  -5,  53 }, {  -7,  61 }, { -11,  75 }, { -15,  77 },
+        { -17,  91 }, { -25, 107 }, { -25, 111 }, { -28, 122 },
+        { -11,  76 }, { -10,  44 }, { -10,  52 }, { -10,  57 },
+        {  -9,  58 }, { -16,  72 }, {  -7,  69 }, {  -4,  69 },
+        {  -5,  74 }, {  -9,  86 }, {   2,  66 }, {  -9,  34 },
+        {   1,  32 }, {  11,  31 }, {   5,  52 }, {  -2,  55 },
+        {  -2,  67 }, {   0,  73 }, {  -8,  89 }, {   3,  52 },
+        {   7,   4 }, {  10,   8 }, {  17,   8 }, {  16,  19 },
+        {   3,  37 }, {  -1,  61 }, {  -5,  73 }, {  -1,  70 },
+        {  -4,  78 },
+
+        /* 276 a bit special (not used, x264_cabac_encode_bypass is used instead) */
+        { 0, 0 },
+
+        /* 277 - 337 */
+        { -21, 126 }, { -23, 124 }, { -20, 110 }, { -26, 126 },
+        { -25, 124 }, { -17, 105 }, { -27, 121 }, { -27, 117 },
+        { -17, 102 }, { -26, 117 }, { -27, 116 }, { -33, 122 },
+        { -10,  95 }, { -14, 100 }, {  -8,  95 }, { -17, 111 },
+        { -28, 114 }, {  -6,  89 }, {  -2,  80 }, {  -4,  82 },
+        {  -9,  85 }, {  -8,  81 }, {  -1,  72 }, {   5,  64 },
+        {   1,  67 }, {   9,  56 }, {   0,  69 }, {   1,  69 },
+        {   7,  69 }, {  -7,  69 }, {  -6,  67 }, { -16,  77 },
+        {  -2,  64 }, {   2,  61 }, {  -6,  67 }, {  -3,  64 },
+        {   2,  57 }, {  -3,  65 }, {  -3,  66 }, {   0,  62 },
+        {   9,  51 }, {  -1,  66 }, {  -2,  71 }, {  -2,  75 },
+        {  -1,  70 }, {  -9,  72 }, {  14,  60 }, {  16,  37 },
+        {   0,  47 }, {  18,  35 }, {  11,  37 }, {  12,  41 },
+        {  10,  41 }, {   2,  48 }, {  12,  41 }, {  13,  41 },
+        {   0,  59 }, {   3,  50 }, {  19,  40 }, {   3,  66 },
+        {  18,  50 },
+
+        /* 338 - 398 */
+        {  19,  -6 }, {  18,  -6 }, {  14,   0 }, {  26, -12 },
+        {  31, -16 }, {  33, -25 }, {  33, -22 }, {  37, -28 },
+        {  39, -30 }, {  42, -30 }, {  47, -42 }, {  45, -36 },
+        {  49, -34 }, {  41, -17 }, {  32,   9 }, {  69, -71 },
+        {  63, -63 }, {  66, -64 }, {  77, -74 }, {  54, -39 },
+        {  52, -35 }, {  41, -10 }, {  36,   0 }, {  40,  -1 },
+        {  30,  14 }, {  28,  26 }, {  23,  37 }, {  12,  55 },
+        {  11,  65 }, {  37, -33 }, {  39, -36 }, {  40, -37 },
+        {  38, -30 }, {  46, -33 }, {  42, -30 }, {  40, -24 },
+        {  49, -29 }, {  38, -12 }, {  40, -10 }, {  38,  -3 },
+        {  46,  -5 }, {  31,  20 }, {  29,  30 }, {  25,  44 },
+        {  12,  48 }, {  11,  49 }, {  26,  45 }, {  22,  22 },
+        {  23,  22 }, {  27,  21 }, {  33,  20 }, {  26,  28 },
+        {  30,  24 }, {  27,  34 }, {  18,  42 }, {  25,  39 },
+        {  18,  50 }, {  12,  70 }, {  21,  54 }, {  14,  71 },
+        {  11,  83 },
+
+    },
+
+    /* i_cabac_init_idc == 2 */
+    {
+        /* 0 - 10 */
+        {  20, -15 }, {   2,  54 }, {   3,  74 }, {  20, -15 },
+        {   2,  54 }, {   3,  74 }, { -28, 127 }, { -23, 104 },
+        {  -6,  53 }, {  -1,  54 }, {   7,  51 },
+
+        /* 11 - 23 */
+        {  29,  16 }, {  25,   0 }, {  14,   0 }, { -10,  51 },
+        {  -3,  62 }, { -27,  99 }, {  26,  16 }, {  -4,  85 },
+        { -24, 102 }, {   5,  57 }, {   6,  57 }, { -17,  73 },
+        {  14,  57 },
+
+        /* 24 - 39 */
+        {  20,  40 }, {  20,  10 }, {  29,   0 }, {  54,   0 },
+        {  37,  42 }, {  12,  97 }, { -32, 127 }, { -22, 117 },
+        {  -2,  74 }, {  -4,  85 }, { -24, 102 }, {   5,  57 },
+        {  -6,  93 }, { -14,  88 }, {  -6,  44 }, {   4,  55 },
+
+        /* 40 - 53 */
+        { -11,  89 },{ -15,  103 },{ -21,  116 },{  19,  57 },
+        {  20,  58 },{   4,  84 },{   6,  96 },{   1,  63 },
+        {  -5,  85 },{ -13,  106 },{   5,  63 },{   6,  75 },
+        {  -3,  90 },{  -1,  101 },
+
+        /* 54 - 59 */
+        {   3,  55 },{  -4,  79 },{  -2,  75 },{ -12,  97 },
+        {  -7,  50 },{   1,  60 },
+
+        /* 60 - 69 */
+        { 0, 41 },   { 0, 63 },   { 0, 63 },     { 0, 63 },
+        { -9, 83 },  { 4, 86 },   { 0, 97 },     { -7, 72 },
+        { 13, 41 },  { 3, 62 },
+
+        /* 70 - 104 */
+        {   7,  34 }, {  -9,  88 }, { -20, 127 }, { -36, 127 },
+        { -17,  91 }, { -14,  95 }, { -25,  84 }, { -25,  86 },
+        { -12,  89 }, { -17,  91 }, { -31, 127 }, { -14,  76 },
+        { -18, 103 }, { -13,  90 }, { -37, 127 }, {  11,  80 },
+        {   5,  76 }, {   2,  84 }, {   5,  78 }, {  -6,  55 },
+        {   4,  61 }, { -14,  83 }, { -37, 127 }, {  -5,  79 },
+        { -11, 104 }, { -11,  91 }, { -30, 127 }, {   0,  65 },
+        {  -2,  79 }, {   0,  72 }, {  -4,  92 }, {  -6,  56 },
+        {   3,  68 }, {  -8,  71 }, { -13,  98 },
+
+        /* 105 -> 165 */
+        {  -4,  86 }, { -12,  88 }, {  -5,  82 }, {  -3,  72 },
+        {  -4,  67 }, {  -8,  72 }, { -16,  89 }, {  -9,  69 },
+        {  -1,  59 }, {   5,  66 }, {   4,  57 }, {  -4,  71 },
+        {  -2,  71 }, {   2,  58 }, {  -1,  74 }, {  -4,  44 },
+        {  -1,  69 }, {   0,  62 }, {  -7,  51 }, {  -4,  47 },
+        {  -6,  42 }, {  -3,  41 }, {  -6,  53 }, {   8,  76 },
+        {  -9,  78 }, { -11,  83 }, {   9,  52 }, {   0,  67 },
+        {  -5,  90 }, {   1,  67 }, { -15,  72 }, {  -5,  75 },
+        {  -8,  80 }, { -21,  83 }, { -21,  64 }, { -13,  31 },
+        { -25,  64 }, { -29,  94 }, {   9,  75 }, {  17,  63 },
+        {  -8,  74 }, {  -5,  35 }, {  -2,  27 }, {  13,  91 },
+        {   3,  65 }, {  -7,  69 }, {   8,  77 }, { -10,  66 },
+        {   3,  62 }, {  -3,  68 }, { -20,  81 }, {   0,  30 },
+        {   1,   7 }, {  -3,  23 }, { -21,  74 }, {  16,  66 },
+        { -23, 124 }, {  17,  37 }, {  44, -18 }, {  50, -34 },
+        { -22, 127 },
+
+        /* 166 - 226 */
+        {   4,  39 }, {   0,  42 }, {   7,  34 }, {  11,  29 },
+        {   8,  31 }, {   6,  37 }, {   7,  42 }, {   3,  40 },
+        {   8,  33 }, {  13,  43 }, {  13,  36 }, {   4,  47 },
+        {   3,  55 }, {   2,  58 }, {   6,  60 }, {   8,  44 },
+        {  11,  44 }, {  14,  42 }, {   7,  48 }, {   4,  56 },
+        {   4,  52 }, {  13,  37 }, {   9,  49 }, {  19,  58 },
+        {  10,  48 }, {  12,  45 }, {   0,  69 }, {  20,  33 },
+        {   8,  63 }, {  35, -18 }, {  33, -25 }, {  28,  -3 },
+        {  24,  10 }, {  27,   0 }, {  34, -14 }, {  52, -44 },
+        {  39, -24 }, {  19,  17 }, {  31,  25 }, {  36,  29 },
+        {  24,  33 }, {  34,  15 }, {  30,  20 }, {  22,  73 },
+        {  20,  34 }, {  19,  31 }, {  27,  44 }, {  19,  16 },
+        {  15,  36 }, {  15,  36 }, {  21,  28 }, {  25,  21 },
+        {  30,  20 }, {  31,  12 }, {  27,  16 }, {  24,  42 },
+        {   0,  93 }, {  14,  56 }, {  15,  57 }, {  26,  38 },
+        { -24, 127 },
+
+        /* 227 - 275 */
+        { -24, 115 }, { -22,  82 }, {  -9,  62 }, {   0,  53 },
+        {   0,  59 }, { -14,  85 }, { -13,  89 }, { -13,  94 },
+        { -11,  92 }, { -29, 127 }, { -21, 100 }, { -14,  57 },
+        { -12,  67 }, { -11,  71 }, { -10,  77 }, { -21,  85 },
+        { -16,  88 }, { -23, 104 }, { -15,  98 }, { -37, 127 },
+        { -10,  82 }, {  -8,  48 }, {  -8,  61 }, {  -8,  66 },
+        {  -7,  70 }, { -14,  75 }, { -10,  79 }, {  -9,  83 },
+        { -12,  92 }, { -18, 108 }, {  -4,  79 }, { -22,  69 },
+        { -16,  75 }, {  -2,  58 }, {   1,  58 }, { -13,  78 },
+        {  -9,  83 }, {  -4,  81 }, { -13,  99 }, { -13,  81 },
+        {  -6,  38 }, { -13,  62 }, {  -6,  58 }, {  -2,  59 },
+        { -16,  73 }, { -10,  76 }, { -13,  86 }, {  -9,  83 },
+        { -10,  87 },
+
+        /* 276 a bit special (not used, x264_cabac_encode_bypass is used instead) */
+        { 0, 0 },
+
+        /* 277 - 337 */
+        { -22, 127 }, { -25, 127 }, { -25, 120 }, { -27, 127 },
+        { -19, 114 }, { -23, 117 }, { -25, 118 }, { -26, 117 },
+        { -24, 113 }, { -28, 118 }, { -31, 120 }, { -37, 124 },
+        { -10,  94 }, { -15, 102 }, { -10,  99 }, { -13, 106 },
+        { -50, 127 }, {  -5,  92 }, {  17,  57 }, {  -5,  86 },
+        { -13,  94 }, { -12,  91 }, {  -2,  77 }, {   0,  71 },
+        {  -1,  73 }, {   4,  64 }, {  -7,  81 }, {   5,  64 },
+        {  15,  57 }, {   1,  67 }, {   0,  68 }, { -10,  67 },
+        {   1,  68 }, {   0,  77 }, {   2,  64 }, {   0,  68 },
+        {  -5,  78 }, {   7,  55 }, {   5,  59 }, {   2,  65 },
+        {  14,  54 }, {  15,  44 }, {   5,  60 }, {   2,  70 },
+        {  -2,  76 }, { -18,  86 }, {  12,  70 }, {   5,  64 },
+        { -12,  70 }, {  11,  55 }, {   5,  56 }, {   0,  69 },
+        {   2,  65 }, {  -6,  74 }, {   5,  54 }, {   7,  54 },
+        {  -6,  76 }, { -11,  82 }, {  -2,  77 }, {  -2,  77 },
+        {  25,  42 },
+
+        /* 338 - 398 */
+        {  17, -13 }, {  16,  -9 }, {  17, -12 }, {  27, -21 },
+        {  37, -30 }, {  41, -40 }, {  42, -41 }, {  48, -47 },
+        {  39, -32 }, {  46, -40 }, {  52, -51 }, {  46, -41 },
+        {  52, -39 }, {  43, -19 }, {  32,  11 }, {  61, -55 },
+        {  56, -46 }, {  62, -50 }, {  81, -67 }, {  45, -20 },
+        {  35,  -2 }, {  28,  15 }, {  34,   1 }, {  39,   1 },
+        {  30,  17 }, {  20,  38 }, {  18,  45 }, {  15,  54 },
+        {   0,  79 }, {  36, -16 }, {  37, -14 }, {  37, -17 },
+        {  32,   1 }, {  34,  15 }, {  29,  15 }, {  24,  25 },
+        {  34,  22 }, {  31,  16 }, {  35,  18 }, {  31,  28 },
+        {  33,  41 }, {  36,  28 }, {  27,  47 }, {  21,  62 },
+        {  18,  31 }, {  19,  26 }, {  36,  24 }, {  24,  23 },
+        {  27,  16 }, {  24,  30 }, {  31,  29 }, {  22,  41 },
+        {  22,  42 }, {  16,  60 }, {  15,  52 }, {  14,  60 },
+        {   3,  78 }, { -16, 123 }, {  21,  53 }, {  22,  56 },
+        {  25,  61 },
+    }
+};
+
+static const int x264_cabac_range_lps[64][4] =
+{   /* 64 x 4; presumably [state][range quantiser] - confirm at the use site */
+    { 128, 176, 208, 240 }, { 128, 167, 197, 227 }, { 128, 158, 187, 216 },
+    { 123, 150, 178, 205 }, { 116, 142, 169, 195 }, { 111, 135, 160, 185 },
+    { 105, 128, 152, 175 }, { 100, 122, 144, 166 }, {  95, 116, 137, 158 },
+    {  90, 110, 130, 150 }, {  85, 104, 123, 142 }, {  81,  99, 117, 135 },
+    {  77,  94, 111, 128 }, {  73,  89, 105, 122 }, {  69,  85, 100, 116 },
+    {  66,  80,  95, 110 }, {  62,  76,  90, 104 }, {  59,  72,  86,  99 },
+    {  56,  69,  81,  94 }, {  53,  65,  77,  89 }, {  51,  62,  73,  85 },
+    {  48,  59,  69,  80 }, {  46,  56,  66,  76 }, {  43,  53,  63,  72 },
+    {  41,  50,  59,  69 }, {  39,  48,  56,  65 }, {  37,  45,  54,  62 },
+    {  35,  43,  51,  59 }, {  33,  41,  48,  56 }, {  32,  39,  46,  53 },
+    {  30,  37,  43,  50 }, {  29,  35,  41,  48 }, {  27,  33,  39,  45 },
+    {  26,  31,  37,  43 }, {  24,  30,  35,  41 }, {  23,  28,  33,  39 },
+    {  22,  27,  32,  37 }, {  21,  26,  30,  35 }, {  20,  24,  29,  33 },
+    {  19,  23,  27,  31 }, {  18,  22,  26,  30 }, {  17,  21,  25,  28 },
+    {  16,  20,  23,  27 }, {  15,  19,  22,  25 }, {  14,  18,  21,  24 },
+    {  14,  17,  20,  23 }, {  13,  16,  19,  22 }, {  12,  15,  18,  21 },
+    {  12,  14,  17,  20 }, {  11,  14,  16,  19 }, {  11,  13,  15,  18 },
+    {  10,  12,  15,  17 }, {  10,  12,  14,  16 }, {   9,  11,  13,  15 },
+    {   9,  11,  12,  14 }, {   8,  10,  12,  14 }, {   8,   9,  11,  13 },
+    {   7,   9,  11,  12 }, {   7,   9,  10,  12 }, {   7,   8,  10,  11 },
+    {   6,   8,   9,  11 }, {   6,   7,   9,  10 }, {   6,   7,   8,   9 },
+    {   2,   2,   2,   2 },
+};
+
+static const int x264_transition_lps[64] =
+{   /* values in 0..63; presumably the next state after an LPS - confirm */
+     0, 0, 1, 2, 2, 4, 4, 5, 6, 7, 8, 9, 9,11,11,12,
+    13,13,15,15,16,16,18,18,19,19,21,21,22,22,23,24,
+    24,25,26,26,27,27,28,29,29,30,30,30,31,32,32,33,
+    33,33,34,34,35,35,35,36,36,36,37,37,37,38,38,63
+};
+static const int x264_transition_mps[64] =
+{   /* entry i is i+1 for i < 62; entries 62 and 63 map to themselves */
+     1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,16,
+    17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,
+    33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,
+    49,50,51,52,53,54,55,56,57,58,59,60,61,62,62,63,
+};
+
+#define FIX8(f) ((int)((f)*(1<<8))) /* f scaled by 256, truncated to int */
+static int x264_cabac_probability[128] =
+{   /* FIX8 values: 0.9812 falling to 0.5000, then 0.5000 falling to 0.0187 */
+    FIX8(0.9812), FIX8(0.9802), FIX8(0.9792), FIX8(0.9781),
+    FIX8(0.9769), FIX8(0.9757), FIX8(0.9744), FIX8(0.9730),
+    FIX8(0.9716), FIX8(0.9700), FIX8(0.9684), FIX8(0.9667),
+    FIX8(0.9650), FIX8(0.9631), FIX8(0.9611), FIX8(0.9590),
+    FIX8(0.9568), FIX8(0.9545), FIX8(0.9521), FIX8(0.9495),
+    FIX8(0.9468), FIX8(0.9440), FIX8(0.9410), FIX8(0.9378),
+    FIX8(0.9345), FIX8(0.9310), FIX8(0.9273), FIX8(0.9234),
+    FIX8(0.9193), FIX8(0.9150), FIX8(0.9105), FIX8(0.9057),
+    FIX8(0.9006), FIX8(0.8953), FIX8(0.8897), FIX8(0.8838),
+    FIX8(0.8776), FIX8(0.8710), FIX8(0.8641), FIX8(0.8569),
+    FIX8(0.8492), FIX8(0.8411), FIX8(0.8326), FIX8(0.8237),
+    FIX8(0.8143), FIX8(0.8043), FIX8(0.7938), FIX8(0.7828),
+    FIX8(0.7712), FIX8(0.7590), FIX8(0.7461), FIX8(0.7325),
+    FIX8(0.7182), FIX8(0.7031), FIX8(0.6872), FIX8(0.6705),
+    FIX8(0.6528), FIX8(0.6343), FIX8(0.6147), FIX8(0.5941),
+    FIX8(0.5724), FIX8(0.5495), FIX8(0.5254), FIX8(0.5000),
+    FIX8(0.5000), FIX8(0.4746), FIX8(0.4505), FIX8(0.4276),
+    FIX8(0.4059), FIX8(0.3853), FIX8(0.3657), FIX8(0.3472),
+    FIX8(0.3295), FIX8(0.3128), FIX8(0.2969), FIX8(0.2818),
+    FIX8(0.2675), FIX8(0.2539), FIX8(0.2410), FIX8(0.2288),
+    FIX8(0.2172), FIX8(0.2062), FIX8(0.1957), FIX8(0.1857),
+    FIX8(0.1763), FIX8(0.1674), FIX8(0.1589), FIX8(0.1508),
+    FIX8(0.1431), FIX8(0.1359), FIX8(0.1290), FIX8(0.1224),
+    FIX8(0.1162), FIX8(0.1103), FIX8(0.1047), FIX8(0.0994),
+    FIX8(0.0943), FIX8(0.0895), FIX8(0.0850), FIX8(0.0807),
+    FIX8(0.0766), FIX8(0.0727), FIX8(0.0690), FIX8(0.0655),
+    FIX8(0.0622), FIX8(0.0590), FIX8(0.0560), FIX8(0.0532),
+    FIX8(0.0505), FIX8(0.0479), FIX8(0.0455), FIX8(0.0432),
+    FIX8(0.0410), FIX8(0.0389), FIX8(0.0369), FIX8(0.0350),
+    FIX8(0.0333), FIX8(0.0316), FIX8(0.0300), FIX8(0.0284),
+    FIX8(0.0270), FIX8(0.0256), FIX8(0.0243), FIX8(0.0231),
+    FIX8(0.0219), FIX8(0.0208), FIX8(0.0198), FIX8(0.0187)
+};
+/* -log2( p ), fix8: entry k is the cost in bits of a symbol whose probability is x264_cabac_probability[k] */
+static const int x264_cabac_entropy[128] =
+{
+    FIX8(0.0273), FIX8(0.0288), FIX8(0.0303), FIX8(0.0320),
+    FIX8(0.0337), FIX8(0.0355), FIX8(0.0375), FIX8(0.0395),
+    FIX8(0.0416), FIX8(0.0439), FIX8(0.0463), FIX8(0.0488),
+    FIX8(0.0515), FIX8(0.0543), FIX8(0.0572), FIX8(0.0604),
+    FIX8(0.0637), FIX8(0.0671), FIX8(0.0708), FIX8(0.0747),
+    FIX8(0.0788), FIX8(0.0832), FIX8(0.0878), FIX8(0.0926),
+    FIX8(0.0977), FIX8(0.1032), FIX8(0.1089), FIX8(0.1149),
+    FIX8(0.1214), FIX8(0.1282), FIX8(0.1353), FIX8(0.1429),
+    FIX8(0.1510), FIX8(0.1596), FIX8(0.1686), FIX8(0.1782),
+    FIX8(0.1884), FIX8(0.1992), FIX8(0.2107), FIX8(0.2229),
+    FIX8(0.2358), FIX8(0.2496), FIX8(0.2642), FIX8(0.2798),
+    FIX8(0.2964), FIX8(0.3142), FIX8(0.3331), FIX8(0.3532),
+    FIX8(0.3748), FIX8(0.3979), FIX8(0.4226), FIX8(0.4491),
+    FIX8(0.4776), FIX8(0.5082), FIX8(0.5412), FIX8(0.5768),
+    FIX8(0.6152), FIX8(0.6568), FIX8(0.7020), FIX8(0.7513),
+    FIX8(0.8050), FIX8(0.8638), FIX8(0.9285), FIX8(1.0000),
+    FIX8(1.0000), FIX8(1.0752), FIX8(1.1504), FIX8(1.2256),
+    FIX8(1.3008), FIX8(1.3759), FIX8(1.4511), FIX8(1.5263),
+    FIX8(1.6015), FIX8(1.6767), FIX8(1.7519), FIX8(1.8271),
+    FIX8(1.9023), FIX8(1.9775), FIX8(2.0527), FIX8(2.1278),
+    FIX8(2.2030), FIX8(2.2782), FIX8(2.3534), FIX8(2.4286),
+    FIX8(2.5038), FIX8(2.5790), FIX8(2.6542), FIX8(2.7294),
+    FIX8(2.8046), FIX8(2.8797), FIX8(2.9549), FIX8(3.0301),
+    FIX8(3.1053), FIX8(3.1805), FIX8(3.2557), FIX8(3.3309),
+    FIX8(3.4061), FIX8(3.4813), FIX8(3.5565), FIX8(3.6316),
+    FIX8(3.7068), FIX8(3.7820), FIX8(3.8572), FIX8(3.9324),
+    FIX8(4.0076), FIX8(4.0828), FIX8(4.1580), FIX8(4.2332),
+    FIX8(4.3083), FIX8(4.3836), FIX8(4.4588), FIX8(4.5339),
+    FIX8(4.6091), FIX8(4.6843), FIX8(4.7595), FIX8(4.8347),
+    FIX8(4.9099), FIX8(4.9851), FIX8(5.0602), FIX8(5.1354),
+    FIX8(5.2106), FIX8(5.2859), FIX8(5.3610), FIX8(5.4362),
+    FIX8(5.5114), FIX8(5.5866), FIX8(5.6618), FIX8(5.7370)
+};
+
+#undef FIX8
+
+
+/*****************************************************************************
+ * x264_cabac_context_init: set up the 399 context models for a slice.
+ * The { m, n } rows come from the I table for I slices and from P/B table
+ * number i_model otherwise; every context gets its state and MPS from
+ * clip( ((m * i_qp) >> 4) + n, 1, 126 ) and its usage counter is cleared.
+ *****************************************************************************/
+void x264_cabac_context_init( x264_cabac_t *cb, int i_slice_type, int i_qp, int i_model )
+{
+    /* rows of { m, n }, one row per context */
+    const int (*p_init)[2] = ( i_slice_type == SLICE_TYPE_I )
+                           ? x264_cabac_context_init_I
+                           : x264_cabac_context_init_PB[i_model];
+    int i_ctx;
+
+    for( i_ctx = 0; i_ctx < 399; i_ctx++ )
+    {
+        const int i_m = p_init[i_ctx][0];
+        const int i_n = p_init[i_ctx][1];
+        const int i_pre_state = x264_clip3( ((i_m * i_qp) >> 4) + i_n, 1, 126 );
+
+        /* 1..63   -> MPS 0, state 63 - pre_state
+         * 64..126 -> MPS 1, state pre_state - 64 */
+        if( i_pre_state > 63 )
+        {
+            cb->ctxstate[i_ctx].i_mps   = 1;
+            cb->ctxstate[i_ctx].i_state = i_pre_state - 64;
+        }
+        else
+        {
+            cb->ctxstate[i_ctx].i_mps   = 0;
+            cb->ctxstate[i_ctx].i_state = 63 - i_pre_state;
+        }
+        cb->ctxstate[i_ctx].i_count = 0;
+    }
+}
+
+/*****************************************************************************
+ * x264_cabac_decode_init: attach the decoder to s and load the first 9 bits
+ *****************************************************************************/
+void x264_cabac_decode_init( x264_cabac_t *cb, bs_t *s )
+{
+    cb->s       = s;
+    cb->i_low   = bs_read( s, 9 );
+    cb->i_range = 0x01fe;
+}
+
+/* RenormD: until the range is >= 0x100, double it and shift one new bit into i_low */
+static inline void x264_cabac_decode_renorm( x264_cabac_t *cb )
+{
+    for( ; cb->i_range < 0x0100; cb->i_range <<= 1 )
+    {
+        cb->i_low = ( cb->i_low << 1 )|bs_read( cb->s, 1 );
+    }
+}
+
+/* Decode one bin with context i_ctx; the context is adapted and the decoder
+ * renormalised before the bin value is returned. */
+int  x264_cabac_decode_decision( x264_cabac_t *cb, int i_ctx )
+{
+    const int i_state = cb->ctxstate[i_ctx].i_state;
+    const int i_mps   = cb->ctxstate[i_ctx].i_mps;
+    /* LPS sub-range: row = state, column = bits 6..7 of the current range */
+    const int i_lps   = x264_cabac_range_lps[i_state][(cb->i_range>>6)&0x03];
+    int b_bin;
+
+    cb->i_range -= i_lps;
+
+    if( cb->i_low < cb->i_range )
+    {
+        /* offset inside the MPS part */
+        b_bin = i_mps;
+        cb->ctxstate[i_ctx].i_state = x264_transition_mps[i_state];
+    }
+    else
+    {
+        /* offset inside the LPS part, which becomes the new interval */
+        b_bin = 1 - i_mps;
+        cb->i_low  -= cb->i_range;
+        cb->i_range = i_lps;
+
+        /* an LPS in state 0 flips the MPS */
+        if( i_state == 0 )
+        {
+            cb->ctxstate[i_ctx].i_mps = b_bin;
+        }
+        cb->ctxstate[i_ctx].i_state = x264_transition_lps[i_state];
+    }
+    x264_cabac_decode_renorm( cb );
+    return b_bin;
+}
+/* Decode one bypass bin: no context is used and the range is left untouched. */
+int  x264_cabac_decode_bypass( x264_cabac_t *cb )
+{
+    const int i_offset = (cb->i_low << 1)|bs_read( cb->s, 1 );
+    const int b_one    = i_offset >= cb->i_range;
+
+    /* a 1 consumes the range: remove it from the offset */
+    cb->i_low = b_one ? i_offset - cb->i_range : i_offset;
+
+    return b_one;
+}
+/* Decode the terminate bin: 1 = end reached (state untouched), 0 = go on. */
+int  x264_cabac_decode_terminal( x264_cabac_t *cb )
+{
+    const int b_end = cb->i_low >= cb->i_range - 2;
+    if( !b_end )
+    {
+        cb->i_range -= 2;
+        x264_cabac_decode_renorm( cb );
+    }
+    return b_end;
+}
+
+
+/*****************************************************************************
+ * x264_cabac_model_init: start with init table 0 and no recorded cost (-1)
+ *****************************************************************************/
+void x264_cabac_model_init( x264_cabac_t *cb )
+{
+    int i_type = 3;
+    /* the three entries are independent, so the walk order does not matter */
+    while( i_type-- > 0 )
+    {
+        cb->slice[i_type].i_cost  = -1;
+        cb->slice[i_type].i_model = 0;
+    }
+}
+/* Return the init table index currently stored for i_slice_type (index into cb->slice[3], not range-checked). */
+int  x264_cabac_model_get ( x264_cabac_t *cb, int i_slice_type )
+{
+    return cb->slice[i_slice_type].i_model;
+}
+/* For a non-I slice type, pick the P/B init table (0..2) whose initial states give the lowest estimated cost against the current context states. */
+void x264_cabac_model_update( x264_cabac_t *cb, int i_slice_type, int i_qp )
+{
+    int i;
+    /* NOTE(review): cb->slice[] has 3 entries; SLICE_TYPE_SP/SI (3, 4) would index past it -- confirm callers never pass them */
+    if( i_slice_type == SLICE_TYPE_I )
+    {
+        return;
+    }
+    cb->slice[i_slice_type].i_cost = -1;
+    /* try each of the three P/B init tables */
+    for( i = 0; i < 3; i++ )
+    {
+        int i_ctx;
+        int i_cost;
+
+        i_cost = 0; /* fix8 */
+
+        for( i_ctx = 0; i_ctx < 399; i_ctx++ )
+        {
+            int i_weight;
+            int i_model_state;
+            int i_ctx_state;
+            /* weight = i_count / 32 in fix8, saturated at 1.0 (256) */
+            i_weight = X264_MIN( (1<<8), (cb->ctxstate[i_ctx].i_count<<8) / 32 );
+            i_model_state = x264_clip3( ((x264_cabac_context_init_PB[i][i_ctx][0] * i_qp)>>4) +
+                                          x264_cabac_context_init_PB[i][i_ctx][1], 0, 127 );
+            i_ctx_state   = cb->ctxstate[i_ctx].i_mps ? 64 + cb->ctxstate[i_ctx].i_state : 63 - cb->ctxstate[i_ctx].i_state;
+            /* p(k)*bits(k) for both symbols: current state's probabilities, table state's bit costs. NOTE(review): clip is 0..127 here but 1..126 in x264_cabac_context_init */
+            i_cost += (i_weight * (( x264_cabac_probability[      i_ctx_state] * x264_cabac_entropy[      i_model_state] +
+                                     x264_cabac_probability[127 - i_ctx_state] * x264_cabac_entropy[127 - i_model_state] ) >> 8))>>8;
+        }
+        /* keep the cheapest table so far (-1 means no cost recorded yet) */
+        if( cb->slice[i_slice_type].i_cost == -1 || cb->slice[i_slice_type].i_cost > i_cost )
+        {
+            cb->slice[i_slice_type].i_model= i;
+            cb->slice[i_slice_type].i_cost = i_cost;
+        }
+    }
+}
+
+/* Reset the arithmetic encoder state and attach it to the output bitstream s. */
+void x264_cabac_encode_init( x264_cabac_t *cb, bs_t *s )
+{
+    cb->s = s;
+    cb->i_range = 0x01FE;
+    cb->b_first_bit = 1;
+    cb->i_low = 0;
+    cb->i_sym_cnt = cb->i_bits_outstanding = 0;
+}
+
+/* PutBit: write bit b, then every pending "outstanding" bit as its inverse.
+ * While b_first_bit is set (as x264_cabac_encode_init leaves it) no bit is
+ * written for b; the call only clears the flag. */
+static inline void x264_cabac_putbit( x264_cabac_t *cb, int b )
+{
+    if( !cb->b_first_bit )
+    {
+        bs_write1( cb->s, b );
+    }
+    cb->b_first_bit = 0;
+
+    /* flush the deferred bits */
+    for( ; cb->i_bits_outstanding > 0; cb->i_bits_outstanding-- )
+    {
+        bs_write1( cb->s, 1 - b );
+    }
+}
+
+/* RenormE: while the range is below 0x100, double range and low.
+ * Each doubling either outputs a bit through x264_cabac_putbit() or
+ * defers it by incrementing i_bits_outstanding. */
+static inline void x264_cabac_encode_renorm( x264_cabac_t *cb )
+{
+    /* both values are doubled after each step */
+    for( ; cb->i_range < 0x0100; cb->i_range <<= 1, cb->i_low <<= 1 )
+    {
+        if( cb->i_low >= 0x200 )
+        {
+            /* low >= 0x200: output a 1 */
+            cb->i_low -= 0x200;
+            x264_cabac_putbit( cb, 1 );
+        }
+        else if( cb->i_low >= 0x100 )
+        {
+            /* undecided yet: remember one more outstanding bit */
+            cb->i_low -= 0x100;
+            cb->i_bits_outstanding++;
+        }
+        else
+        {
+            /* low < 0x100: output a 0 */
+            x264_cabac_putbit( cb, 0 );
+        }
+    }
+}
+
+/* Encode bin b with context i_ctx, adapt that context, renormalise and count the symbol. */
+void x264_cabac_encode_decision( x264_cabac_t *cb, int i_ctx, int b )
+{
+    const int i_state = cb->ctxstate[i_ctx].i_state;
+    const int i_mps   = cb->ctxstate[i_ctx].i_mps;
+    /* LPS sub-range: row = state, column = bits 6..7 of the current range */
+    const int i_lps   = x264_cabac_range_lps[i_state][(cb->i_range>>6)&0x03];
+
+#ifdef TRACE
+    if( binCount >= 0 )
+    {
+        fprintf( stderr, "%d  ctx=%d b=%d\n", binCount, i_ctx, b );
+    }
+    fprintf( stderr, "%d  0x%04x  %d  %d\n", binCount++, cb->i_range, i_state, i_mps );
+#endif
+
+    cb->i_range -= i_lps;
+
+    if( b == i_mps )
+    {
+        cb->ctxstate[i_ctx].i_state = x264_transition_mps[i_state];
+    }
+    else
+    {
+        cb->i_low  += cb->i_range;
+        cb->i_range = i_lps;
+
+        /* an LPS in state 0 flips the MPS */
+        if( i_state == 0 )
+        {
+            cb->ctxstate[i_ctx].i_mps = 1 - i_mps;
+        }
+        cb->ctxstate[i_ctx].i_state = x264_transition_lps[i_state];
+    }
+    cb->ctxstate[i_ctx].i_count++;
+    cb->i_sym_cnt++;
+    x264_cabac_encode_renorm( cb );
+}
+
+/* Encode bin b in bypass mode: no context is used and the range is left
+ * unchanged.  One bit is either written through x264_cabac_putbit() or
+ * deferred in i_bits_outstanding; the symbol counter is incremented. */
+void x264_cabac_encode_bypass( x264_cabac_t *cb, int b )
+{
+#ifdef TRACE
+    fprintf( stderr, "%d  0x%04x\n", binCount++, cb->i_range );
+#endif
+
+    /* double low; a 1 also moves it up by one range */
+    cb->i_low = ( cb->i_low << 1 ) + ( b ? cb->i_range : 0 );
+    cb->i_sym_cnt++;
+
+    if( cb->i_low >= 0x400 )
+    {
+        /* low >= 0x400: output a 1 */
+        x264_cabac_putbit( cb, 1 );
+        cb->i_low -= 0x400;
+    }
+    else if( cb->i_low >= 0x200 )
+    {
+        /* undecided yet: remember one more outstanding bit */
+        cb->i_low -= 0x200;
+        cb->i_bits_outstanding++;
+    }
+    else
+    {
+        /* low < 0x200: output a 0 */
+        x264_cabac_putbit( cb, 0 );
+    }
+}
+
+/* Encode the terminate bin: the top 2 units of the range are kept for b = 1. */
+void x264_cabac_encode_terminal( x264_cabac_t *cb, int b )
+{
+#ifdef TRACE
+    fprintf( stderr, "%d  0x%04x\n", binCount++, cb->i_range );
+#endif
+    cb->i_sym_cnt++;
+    cb->i_range -= 2;
+    if( b )
+    {
+        /* move into the reserved part, which becomes the whole interval */
+        cb->i_low  += cb->i_range;
+        cb->i_range = 2;
+    }
+    x264_cabac_encode_renorm( cb );
+}
+/* Finish the CABAC data: output bits 9 and 8 of i_low, then a single 1 bit, then call bs_align_0() on the stream. */
+void x264_cabac_encode_flush( x264_cabac_t *cb )
+{
+    x264_cabac_putbit( cb, (cb->i_low >> 9)&0x01 );
+    bs_write1( cb->s, (cb->i_low >> 8)&0x01 );
+
+    /* NOTE(review): the author marked this "check that"; presumably the stop bit that ends the slice data -- confirm */
+    bs_write1( cb->s, 0x01 );
+    bs_align_0( cb->s );
+}
+
diff --git a/core/cabac.h b/core/cabac.h
new file mode 100644
index 00000000..3051789d
--- /dev/null
+++ b/core/cabac.h
@@ -0,0 +1,78 @@
+/*****************************************************************************
+ * cabac.h: h264 encoder library
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: cabac.h,v 1.1 2004/06/03 19:27:06 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#ifndef _CABAC_H
+#define _CABAC_H 1
+
+typedef struct
+{
+    /* init-table choice per slice type: selected table and its estimated cost (-1 = none yet) */
+    struct
+    {
+        int i_model;
+        int i_cost;
+    } slice[3];
+
+    /* context models: state index, most probable symbol, bins coded with this context */
+    struct
+    {
+        int i_state;
+        int i_mps;
+        int i_count;
+    } ctxstate[399];
+
+    /* arithmetic coder interval */
+    int i_low;
+    int i_range;
+    /* bins encoded so far (incremented by the decision/bypass/terminal encoders) */
+    int i_sym_cnt;
+
+    /* bit stream: b_first_bit makes the first put-bit write nothing, i_bits_outstanding counts deferred bits */
+    int b_first_bit;
+    int i_bits_outstanding;
+    bs_t *s;
+
+} x264_cabac_t;
+
+/* encoder/decoder: init the contexts given i_slice_type, the quantizer (QP) and the model */
+void x264_cabac_context_init( x264_cabac_t *cb, int i_slice_type, int i_qp, int i_model );
+
+/* decoder only: */
+void x264_cabac_decode_init    ( x264_cabac_t *cb, bs_t *s );
+int  x264_cabac_decode_decision( x264_cabac_t *cb, int i_ctx_idx );
+int  x264_cabac_decode_bypass  ( x264_cabac_t *cb );
+int  x264_cabac_decode_terminal( x264_cabac_t *cb );
+
+/* encoder only: adaptive model init */
+void x264_cabac_model_init( x264_cabac_t *cb );
+int  x264_cabac_model_get ( x264_cabac_t *cb, int i_slice_type );
+void x264_cabac_model_update( x264_cabac_t *cb, int i_slice_type, int i_qp );
+/* encoder only: */
+void x264_cabac_encode_init ( x264_cabac_t *cb, bs_t *s );
+void x264_cabac_encode_decision( x264_cabac_t *cb, int i_ctx_idx, int b );
+void x264_cabac_encode_bypass( x264_cabac_t *cb, int b );
+void x264_cabac_encode_terminal( x264_cabac_t *cb, int b );
+void x264_cabac_encode_flush( x264_cabac_t *cb );
+
+
+#endif
diff --git a/core/clip1.h b/core/clip1.h
new file mode 100644
index 00000000..0e8adc21
--- /dev/null
+++ b/core/clip1.h
@@ -0,0 +1,71 @@
+/*****************************************************************************
+ * clip1.h: h264 encoder library
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: clip1.h,v 1.1 2004/06/03 19:27:06 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#ifndef _CLIP1_H
+#define _CLIP1_H 1
+
+/* Clip1 table
+ * XXX : only for tap filter.
+ *
+ * With tap filter (( 1, -5, 20, 20, -5, 1 ) + 16 )/ 32
+ * -> (-2*5 * 255+16)/32 <= out <= (2*1*255 + 2*20*255+16)/32
+ * -> -80 <= out <= 335
+ * So we need a table of 80+335+1 = 416 entries
+ */
+
+static const uint8_t x264_mc_clip1_table[80+1+335] =
+{
+    /* -80 -> -1 */
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,0,0,0,0,0,
+    /* 0 -> 255 */
+    0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 16, 17,
+    18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
+    36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53,
+    54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71,
+    72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
+    90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100,101,102,103,104,105,106,107,
+    108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,
+    126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,
+    144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,
+    162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,
+    180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,
+    198,199,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215,
+    216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,
+    234,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250,251,
+    252,253,254,255,
+    /* 256 -> 335 (80 entries, all saturated to 255) */
+    255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
+    255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
+    255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
+    255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
+    255,255,255,255,255,255,255,255,
+};
+/* Clip x to 0..255 by table lookup; x must lie in -80..335, no range check is done. */
+static inline uint8_t x264_mc_clip1( int x )
+{
+    return x264_mc_clip1_table[x+80];
+}
+
+#endif
diff --git a/core/common.c b/core/common.c
new file mode 100644
index 00000000..b44d9cd6
--- /dev/null
+++ b/core/common.c
@@ -0,0 +1,300 @@
+/*****************************************************************************
+ * common.c: h264 library
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: common.c,v 1.1 2004/06/03 19:27:06 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdint.h>
+
+#ifdef HAVE_MALLOC_H
+#include <malloc.h>
+#endif
+
+#include "common.h"
+#include "cpu.h"
+
+/****************************************************************************
+ * x264_param_default: reset *param to the built-in defaults (also prints the detected CPU flags on stderr)
+ ****************************************************************************/
+void    x264_param_default( x264_param_t *param )
+{
+    /* start from an all-zero structure; the fields below are then set explicitly */
+    memset( param, 0, sizeof( x264_param_t ) );
+
+    /* CPU autodetect */
+    param->cpu = x264_cpu_detect();
+    fprintf( stderr, "x264: cpu capabilities: %s%s%s%s%s%s\n",
+             param->cpu&X264_CPU_MMX ? "MMX " : "",
+             param->cpu&X264_CPU_MMXEXT ? "MMXEXT " : "",
+             param->cpu&X264_CPU_SSE ? "SSE " : "",
+             param->cpu&X264_CPU_SSE2 ? "SSE2 " : "",
+             param->cpu&X264_CPU_3DNOW ? "3DNow! " : "",
+             param->cpu&X264_CPU_ALTIVEC ? "Altivec " : "" );
+
+
+    /* Video properties */
+    param->i_csp           = X264_CSP_I420;
+    param->i_width         = 0;
+    param->i_height        = 0;
+    param->vui.i_sar_width = 0;
+    param->vui.i_sar_height= 0;
+    param->f_fps           = 25.0;
+
+    /* Encoder parameters */
+    param->i_frame_reference = 1;
+    param->i_idrframe = 2;
+    param->i_iframe = 60;
+    param->i_bframe = 0;
+
+    param->b_deblocking_filter = 1;
+    param->i_deblocking_filter_alphac0 = 0;
+    param->i_deblocking_filter_beta = 0;
+
+    param->b_cabac = 0;
+    param->i_cabac_init_idc = -1;
+
+    param->i_bitrate = 3000;
+    param->i_qp_constant = 26;
+
+    param->analyse.intra = X264_ANALYSE_I4x4;
+    param->analyse.inter = X264_ANALYSE_I4x4 | X264_ANALYSE_PSUB16x16;
+}
+
+/****************************************************************************
+ * x264_picture_alloc: allocate the pixel buffer of pic (i_width x i_height,
+ * colorspace i_csp) and fill in plane pointers and strides.  All planes sit
+ * in one x264_malloc() block owned by plane[0]; free it with
+ * x264_picture_clean().  Unknown colorspace: i_plane = 0 and plane[0] = NULL.
+ * NOTE(review): a failed allocation is not reported to the caller.
+ ****************************************************************************/
+void x264_picture_alloc( x264_picture_t *pic, int i_csp, int i_width, int i_height )
+{
+    pic->i_type = X264_TYPE_AUTO;
+    pic->i_qpplus1 = 0;
+    pic->img.i_csp = i_csp;
+    pic->img.plane[0] = NULL;   /* x264_picture_clean() frees plane[0]: never leave it indeterminate */
+    switch( i_csp & X264_CSP_MASK )
+    {
+        case X264_CSP_I420:
+        case X264_CSP_YV12:
+            /* planar: first plane followed by two quarter-size planes in the same block */
+            pic->img.i_plane = 3;
+            pic->img.plane[0] = x264_malloc( 3 * i_width * i_height / 2 );
+            pic->img.plane[1] = pic->img.plane[0] + i_width * i_height;
+            pic->img.plane[2] = pic->img.plane[1] + i_width * i_height / 4;
+            pic->img.i_stride[0] = i_width;
+            pic->img.i_stride[1] = i_width / 2;
+            pic->img.i_stride[2] = i_width / 2;
+            break;
+        case X264_CSP_I422:
+            pic->img.i_plane = 3;
+            pic->img.plane[0] = x264_malloc( 2 * i_width * i_height );
+            pic->img.plane[1] = pic->img.plane[0] + i_width * i_height;
+            pic->img.plane[2] = pic->img.plane[1] + i_width * i_height / 2;
+            pic->img.i_stride[0] = i_width;
+            pic->img.i_stride[1] = i_width / 2;
+            pic->img.i_stride[2] = i_width / 2;
+            break;
+        case X264_CSP_I444:
+            pic->img.i_plane = 3;
+            pic->img.plane[0] = x264_malloc( 3 * i_width * i_height );
+            pic->img.plane[1] = pic->img.plane[0] + i_width * i_height;
+            pic->img.plane[2] = pic->img.plane[1] + i_width * i_height;
+            pic->img.i_stride[0] = i_width;
+            pic->img.i_stride[1] = i_width;
+            pic->img.i_stride[2] = i_width;
+            break;
+        case X264_CSP_YUYV:
+            pic->img.i_plane = 1;
+            pic->img.plane[0] = x264_malloc( 2 * i_width * i_height );
+            pic->img.i_stride[0] = 2 * i_width;
+            break;
+        case X264_CSP_RGB:
+        case X264_CSP_BGR:
+            pic->img.i_plane = 1;
+            pic->img.plane[0] = x264_malloc( 3 * i_width * i_height );
+            pic->img.i_stride[0] = 3 * i_width;
+            break;
+        case X264_CSP_BGRA:
+            pic->img.i_plane = 1;
+            pic->img.plane[0] = x264_malloc( 4 * i_width * i_height );
+            pic->img.i_stride[0] = 4 * i_width;
+            break;
+        default:
+            fprintf( stderr, "invalid CSP\n" );
+            pic->img.i_plane = 0;
+            break;
+    }
+}
+
+/****************************************************************************
+ * x264_picture_clean: free the buffer behind plane[0], then zero the whole x264_picture_t
+ ****************************************************************************/
+void x264_picture_clean( x264_picture_t *pic )
+{
+    x264_free( pic->img.plane[0] );
+
+    /* just to be safe */
+    memset( pic, 0, sizeof( x264_picture_t ) );
+}
+
+/****************************************************************************
+ * x264_nal_encode: serialise nal into p_data.
+ *   Output = optional 4-byte start code (when b_annexeb is set), the
+ *   one-byte NAL header, then the payload with a 0x03 inserted after every
+ *   pair of zero bytes that is followed by a byte <= 0x03.
+ *   The number of bytes written is stored in *pi_data and returned.
+ ****************************************************************************/
+int x264_nal_encode( void *p_data, int *pi_data, int b_annexeb, x264_nal_t *nal )
+{
+    uint8_t * const p_start = p_data;
+    uint8_t *p_out = p_start;
+    const uint8_t *p_in  = nal->p_payload;
+    const uint8_t *p_end = p_in + nal->i_payload;
+    int i_zeros = 0;    /* length of the current run of 0x00 bytes */
+
+    /* FIXME this code doesn't check overflow */
+
+    if( b_annexeb )
+    {
+        /* long nal start code (we always use long ones)*/
+        static const uint8_t startcode[4] = { 0x00, 0x00, 0x00, 0x01 };
+        int i;
+        for( i = 0; i < 4; i++ )
+        {
+            *p_out++ = startcode[i];
+        }
+    }
+
+    /* nal header */
+    *p_out++ = ( 0x00 << 7 ) | ( nal->i_ref_idc << 5 ) | nal->i_type;
+
+    for( ; p_in < p_end; p_in++ )
+    {
+        /* escape: two zeros followed by 0x00..0x03 */
+        if( i_zeros == 2 && *p_in <= 0x03 )
+        {
+            *p_out++ = 0x03;
+            i_zeros = 0;
+        }
+        i_zeros = ( *p_in == 0 ) ? i_zeros + 1 : 0;
+        *p_out++ = *p_in;
+    }
+
+    *pi_data = p_out - p_start;
+    return *pi_data;
+}
+
+/****************************************************************************
+ * x264_nal_decode: parse the NAL header byte of p_data (i_data bytes) and copy
+ * the payload to nal->p_payload, dropping the 0x03 of each 0x00 0x00 0x03.
+ * Sets nal->i_payload; returns 0, or -1 when there is no header byte.
+ ****************************************************************************/
+int x264_nal_decode( x264_nal_t *nal, void *p_data, int i_data )
+{
+    uint8_t *src = p_data;
+    uint8_t *end = src + ( i_data > 0 ? i_data : 0 );
+    uint8_t *dst = nal->p_payload;
+    nal->i_payload = 0;
+    if( src == end ) return -1;     /* nothing to read, not even the header */
+    nal->i_type    = src[0]&0x1f;
+    nal->i_ref_idc = (src[0] >> 5)&0x03;
+    for( src++; src < end; )
+    {
+        /* ">= 3 bytes left" also catches an escape sequence that ends the NAL */
+        if( end - src >= 3 && src[0] == 0x00 && src[1] == 0x00  && src[2] == 0x03 )
+        {
+            *dst++ = 0x00;
+            *dst++ = 0x00;
+            src += 3;
+            continue;
+        }
+        *dst++ = *src++;
+    }
+    /* the size is measured in the destination buffer, not against p_data */
+    nal->i_payload = dst - nal->p_payload;
+    return 0;
+}
+
+
+
+/****************************************************************************
+ * x264_malloc: allocate i_size bytes aligned on 16; NULL if out of memory.
+ * Emulated path keeps the raw malloc pointer and i_size below the result.
+ ****************************************************************************/
+void *x264_malloc( int i_size )
+{
+#ifdef HAVE_MALLOC_H
+    return memalign( 16, i_size );
+#else
+    const size_t i_head = 15 + sizeof( void ** ) + sizeof( int );
+    uint8_t *align_buf, *buf = malloc( i_size + i_head );
+    if( buf == NULL ) return NULL;  /* the header stores below need a real block */
+    align_buf  = buf + i_head;
+    align_buf -= (uintptr_t) align_buf & 15;
+    *( (void **) ( align_buf - sizeof( void ** ) ) ) = buf;
+    *( (int *) ( align_buf - sizeof( void ** ) - sizeof( int ) ) ) = i_size;
+    return align_buf;
+#endif
+}
+
+/****************************************************************************
+ * x264_free: release a block returned by x264_malloc(); NULL is ignored.
+ ****************************************************************************/
+void x264_free( void *p )
+{
+#ifdef HAVE_MALLOC_H
+    void *p_raw = p;
+#else
+    /* the emulated memalign keeps the real malloc() pointer just below p */
+    void *p_raw = p ? ( (void **) p )[-1] : NULL;
+#endif
+    /* free( NULL ) does nothing */
+    free( p_raw );
+}
+
+/****************************************************************************
+ * x264_realloc: resize a block from x264_malloc() (p may be NULL), keeping
+ * its contents.  Returns NULL on failure; p is then still valid.
+ ****************************************************************************/
+void *x264_realloc( void *p, int i_size )
+{
+#ifdef HAVE_MALLOC_H
+    return realloc( p, i_size );
+#else
+    /* stored size: step back in BYTES on the uint8_t pointer first, then cast to int* */
+    int       i_old_size = p ? *( (int*) ( (uint8_t*) p - sizeof( void ** ) - sizeof( int ) ) ) : 0;
+    uint8_t * p_new = x264_malloc( i_size );
+    if( p_new == NULL )
+    {
+        return NULL;
+    }
+    if( i_old_size > 0 && i_size > 0 )
+    {
+        memcpy( p_new, p, ( i_old_size < i_size ) ? i_old_size : i_size );
+    }
+    x264_free( p );
+    return p_new;
+#endif
+}
+
diff --git a/core/common.h b/core/common.h
new file mode 100644
index 00000000..e5c85c05
--- /dev/null
+++ b/core/common.h
@@ -0,0 +1,344 @@
+/*****************************************************************************
+ * common.h: h264 encoder
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: common.h,v 1.1 2004/06/03 19:27:06 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#ifndef _COMMON_H
+#define _COMMON_H 1
+
+#include <stdint.h>
+
+#include "../x264.h"
+#include "bs.h"
+#include "set.h"
+#include "predict.h"
+#include "pixel.h"
+#include "mc.h"
+#include "frame.h"
+#include "dct.h"
+#include "cabac.h"
+#include "csp.h"
+/* NOTE: these macros evaluate their arguments more than once -- pass no expression with side effects */
+#define X264_MIN(a,b) ( (a)<(b) ? (a) : (b) )
+#define X264_MAX(a,b) ( (a)>(b) ? (a) : (b) )
+#define X264_ABS(a)   ( (a)< 0 ? -(a) : (a) )
+
+/* x264_malloc : will do or emulate a memalign
+ * XXX you HAVE TO use x264_free for buffer allocated
+ * with x264_malloc
+ */
+void *x264_malloc( int );
+void *x264_realloc( void *p, int i_size );
+void  x264_free( void * );
+
+/* mdate: return the current date in microsecond */
+int64_t x264_mdate( void );
+
+/* x264_clip3: clamp v into [i_min, i_max].
+ * The lower bound is tested first, so it wins when i_min > i_max. */
+static inline int x264_clip3( int v, int i_min, int i_max )
+{
+    int i_clipped = v;
+    if( v < i_min )
+    {
+        i_clipped = i_min;
+    }
+    else if( v > i_max )
+    {
+        i_clipped = i_max;
+    }
+    return i_clipped;
+}
+/* Slice types; presumably numbered like the H.264 slice_type syntax element (0..4) -- confirm */
+enum slice_type_e
+{
+    SLICE_TYPE_P  = 0,
+    SLICE_TYPE_B  = 1,
+    SLICE_TYPE_I  = 2,
+    SLICE_TYPE_SP = 3,
+    SLICE_TYPE_SI = 4
+};
+/* Slice header values plus pointers to the SPS/PPS they refer to; field names presumably follow the H.264 slice_header() syntax -- confirm */
+typedef struct
+{
+    x264_sps_t *sps;
+    x264_pps_t *pps;
+
+    int i_type;
+    int i_first_mb;
+
+    int i_pps_id;
+
+    int i_frame_num;
+
+    int b_field_pic;
+    int b_bottom_field;
+
+    int i_idr_pic_id;   /* -1 if nal_type != 5 */
+
+    int i_poc_lsb;
+    int i_delta_poc_bottom;
+
+    int i_delta_poc[2];
+    int i_redundant_pic_cnt;
+
+    int b_direct_spatial_mv_pred;
+
+    int b_num_ref_idx_override;
+    int i_num_ref_idx_l0_active;
+    int i_num_ref_idx_l1_active;
+
+    int i_cabac_init_idc;
+
+    int i_qp_delta;
+    int b_sp_for_swidth;
+    int i_qs_delta;
+
+    /* deblocking filter */
+    int i_disable_deblocking_filter_idc;
+    int i_alpha_c0_offset;
+    int i_beta_offset;
+
+} x264_slice_header_t;
+
+/* From ffmpeg: x264_scan8[] maps a block index (16 luma, then 4 Cb, then 4 Cr) to its
+ * cell (x + y*8) in an 8-column x 6-row grid; X264_SCAN8_0 is the cell of luma block 0. */
+#define X264_SCAN8_SIZE (6*8)
+#define X264_SCAN8_0 (4+1*8)
+
+static const int x264_scan8[16+2*4] =
+{
+    /* Luma */
+    4+1*8, 5+1*8, 4+2*8, 5+2*8,
+    6+1*8, 7+1*8, 6+2*8, 7+2*8,
+    4+3*8, 5+3*8, 4+4*8, 5+4*8,
+    6+3*8, 7+3*8, 6+4*8, 7+4*8,
+
+    /* Cb */
+    1+1*8, 2+1*8,
+    1+2*8, 2+2*8,
+
+    /* Cr */
+    1+4*8, 2+4*8,
+    1+5*8, 2+5*8,
+};
+/*
+   0 1 2 3 4 5 6 7
+ 0
+ 1   B B   L L L L
+ 2   B B   L L L L
+ 3         L L L L
+ 4   R R   L L L L
+ 5   R R
+*/
+
+#define X264_BFRAME_MAX 16
+
+typedef struct x264_ratecontrol_t   x264_ratecontrol_t;
+typedef struct x264_vlc_table_t     x264_vlc_table_t;
+
+struct x264_t /* codec context: parameters, bitstream output, frame lists, per-macroblock state, function tables */
+{
+    /* encoder parameters */
+    x264_param_t    param;
+
+    /* bitstream output */
+    struct
+    {
+        int         i_nal;
+        x264_nal_t  nal[3];         /* for now 3 is enough */
+        int         i_bitstream;    /* size of p_bitstream */
+        uint8_t     *p_bitstream;   /* will hold data for all nal */
+        bs_t        bs;
+    } out;
+
+    /* frame number/poc */
+    int             i_frame;
+    int             i_poc;
+
+    int             i_frame_offset; /* decoding only */
+    int             i_frame_num;    /* decoding only */
+    int             i_poc_msb;      /* decoding only */
+    int             i_poc_lsb;      /* decoding only */
+
+    /* We use only one SPS and one PPS */
+    x264_sps_t      sps_array[32];
+    x264_sps_t      *sps;
+    x264_pps_t      pps_array[256];
+    x264_pps_t      *pps;
+    int             i_idr_pic_id;
+
+    /* Slice header */
+    x264_slice_header_t sh;
+
+    /* cabac context */
+    x264_cabac_t    cabac;
+
+    struct
+    {
+        /* Frames to be encoded */
+        x264_frame_t *current[X264_BFRAME_MAX+1];
+        /* Temporary buffer (eg B frames pending until we reach the I/P) */
+        x264_frame_t *next[X264_BFRAME_MAX+1];
+        /* Unused frames */
+        x264_frame_t *unused[X264_BFRAME_MAX+1];
+
+        /* frames used for reference +1 for decoding */
+        x264_frame_t *reference[16+1];
+
+        int i_last_idr; /* How many I non IDR frames from last IDR */
+        int i_last_i;   /* How many P/B frames from last I */
+    } frames;
+
+    /* current frame being encoded */
+    x264_frame_t    *fenc;
+
+    /* frame being reconstructed */
+    x264_frame_t    *fdec;
+
+    /* references lists */
+    int             i_ref0;
+    x264_frame_t    *fref0[16];       /* ref list 0 */
+    int             i_ref1;
+    x264_frame_t    *fref1[16];       /* ref list 1 */
+
+
+
+    /* Current MB DCT coeffs */
+    struct
+    {
+        DECLARE_ALIGNED( int, luma16x16_dc[16], 16 );
+        DECLARE_ALIGNED( int, chroma_dc[2][4], 16 );
+        struct
+        {
+            DECLARE_ALIGNED( int, residual_ac[15], 16 );
+            DECLARE_ALIGNED( int, luma4x4[16], 16 );
+        } block[16+8];
+    } dct;
+
+    /* MB table and cache for current frame/mb */
+    struct
+    {
+        /* Strides */
+        int     i_mb_stride;
+
+        /* Current index */
+        int     i_mb_x;
+        int     i_mb_y;
+        int     i_mb_xy;
+
+        unsigned int i_neighbour;
+
+        /* mb table */
+        int8_t  *type;                      /* mb type */
+        int8_t  *qp;                        /* mb qp */
+        int16_t *cbp;                       /* mb cbp: 0x0?: luma, 0x?0: chroma, 0x100: luma dc, 0x0200 and 0x0400: chroma dc  (all set for PCM)*/
+        int8_t  (*intra4x4_pred_mode)[7];   /* intra4x4 pred mode. for non I4x4 set to I_PRED_4x4_DC(2) */
+        uint8_t (*non_zero_count)[16+4+4];  /* nzc. for I_PCM set to 16 */
+        int8_t  *chroma_pred_mode;          /* chroma_pred_mode. cabac only. for non intra I_PRED_CHROMA_DC(0) */
+        int16_t (*mv[2])[2];                /* mb mv. set to 0 for intra mb */
+        int16_t (*mvd[2])[2];               /* mb mv difference with predict. set to 0 if intra. cabac only */
+        int8_t   *ref[2];                   /* mb ref. set to -1 if not used (intra or Lx only) */
+
+        /* current value */
+        int     i_type;
+        int     i_partition;
+        int     i_sub_partition[4];
+
+        int     i_cbp_luma;
+        int     i_cbp_chroma;
+
+        int     i_intra16x16_pred_mode;
+        int     i_chroma_pred_mode;
+
+        struct
+        {
+            /* pointer over mb of the frame to be compressed */
+            uint8_t *p_fenc[3];
+
+            /* pointer over mb of the frame to be reconstructed */
+            uint8_t *p_fdec[3];
+
+            /* pointer over mb of the references */
+            uint8_t *p_fref[2][16][3];
+
+            /* common stride */
+            int     i_stride[3];
+        } pic;
+
+        /* cache */
+        struct
+        {
+            /* real intra4x4_pred_mode if I_4X4, I_PRED_4x4_DC if mb available, -1 if not */
+            int     intra4x4_pred_mode[X264_SCAN8_SIZE];
+
+            /* i_non_zero_count if available else 0x80 */
+            int     non_zero_count[X264_SCAN8_SIZE];
+
+            /* -1 if unused, -2 if unavailable */
+            int8_t  ref[2][X264_SCAN8_SIZE];
+
+            /* 0 if not available */
+            int16_t mv[2][X264_SCAN8_SIZE][2];
+            int16_t mvd[2][X264_SCAN8_SIZE][2];
+        } cache;
+
+        /* QP tracking */
+        int     i_last_qp;  /* last qp */
+        int     i_last_dqp; /* last delta qp */
+
+    } mb;
+
+    /* rate control encoding only */
+    x264_ratecontrol_t *rc;
+
+    /* stats */
+    struct
+    {
+        /* per slice info */
+        int   i_slice_count[5];
+        int   i_slice_size[5];
+        float f_psnr_y[5];
+        float f_psnr_u[5];
+        float f_psnr_v[5];
+        int   i_mb_count[5][18];
+    } stat;
+
+    /* CPU-dependent functions */
+    x264_predict_t      predict_16x16[4+3];
+    x264_predict_t      predict_8x8[4+3];
+    x264_predict_t      predict_4x4[9+3];
+
+    x264_pixel_function_t pixf;
+    x264_mc_function_t    mc[2];
+    x264_dct_function_t   dctf;
+    x264_csp_function_t   csp;
+
+    /* vlc table for decoding purpose only */
+    x264_vlc_table_t *x264_coeff_token_lookup[5];
+    x264_vlc_table_t *x264_level_prefix_lookup;
+    x264_vlc_table_t *x264_total_zeros_lookup[15];
+    x264_vlc_table_t *x264_total_zeros_dc_lookup[3];
+    x264_vlc_table_t *x264_run_before_lookup[7];
+};
+
+#endif
+
diff --git a/core/cpu.c b/core/cpu.c
new file mode 100644
index 00000000..8e6d1e3e
--- /dev/null
+++ b/core/cpu.c
@@ -0,0 +1,233 @@
+/*****************************************************************************
+ * cpu.c: h264 encoder library
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: cpu.c,v 1.1 2004/06/03 19:27:06 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+
+#include "../x264.h"
+#include "cpu.h"
+
+#ifdef ARCH_X86
+extern int  x264_cpu_cpuid_test( void );
+extern uint32_t  x264_cpu_cpuid( uint32_t op, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx );
+extern void x264_emms( void );
+
+uint32_t x264_cpu_detect( void )
+{
+    /* Probe the CPU through CPUID and return the matching X264_CPU_*
+     * mask.  0 is returned when CPUID is missing, when leaf 0 reports
+     * 0 in EAX, or when MMX is not reported. */
+    uint32_t r_eax, r_ebx, r_ecx, r_edx;   /* outputs of the last query */
+    uint32_t flags;
+    int      is_amd;
+
+    if( !x264_cpu_cpuid_test() )
+        return 0;                          /* No cpuid */
+
+    /* Leaf 0: EAX = highest basic leaf, EBX/EDX/ECX = vendor string. */
+    x264_cpu_cpuid( 0, &r_eax, &r_ebx, &r_ecx, &r_edx );
+    if( r_eax == 0 )
+        return 0;
+
+    /* "Auth" "enti" "cAMD", four ASCII bytes per register. */
+    is_amd = r_ebx == 0x68747541 &&
+             r_edx == 0x69746e65 &&
+             r_ecx == 0x444d4163;
+
+    /* Leaf 1: feature bits in EDX. */
+    x264_cpu_cpuid( 1, &r_eax, &r_ebx, &r_ecx, &r_edx );
+    if( !( r_edx & 0x00800000 ) )
+        return 0;                          /* No MMX */
+
+    flags = X264_CPU_MMX;
+    if( r_edx & 0x02000000 )
+    {
+        /* SSE - identical to AMD MMX extensions */
+        flags |= X264_CPU_MMXEXT | X264_CPU_SSE;
+    }
+    if( r_edx & 0x04000000 )
+    {
+        /* bit 26 of leaf 1 EDX is the SSE2 flag */
+        flags |= X264_CPU_SSE2;
+    }
+
+    /* Leaf 0x80000000: EAX = highest extended leaf. */
+    x264_cpu_cpuid( 0x80000000, &r_eax, &r_ebx, &r_ecx, &r_edx );
+    if( r_eax >= 0x80000001 )
+    {
+        /* Leaf 0x80000001: extended feature bits in EDX. */
+        x264_cpu_cpuid( 0x80000001, &r_eax, &r_ebx, &r_ecx, &r_edx );
+
+        if( r_edx & 0x80000000 )
+        {
+            flags |= X264_CPU_3DNOW;
+        }
+        if( is_amd && ( r_edx & 0x00400000 ) )
+        {
+            /* AMD MMX extensions */
+            flags |= X264_CPU_MMXEXT;
+        }
+    }
+
+    return flags;
+}
+
+void     x264_cpu_restore( uint32_t cpu )
+{
+    /* Call x264_emms() when any of the MMX/3DNow! flags is set in cpu. */
+    const uint32_t mmx_family = X264_CPU_MMX | X264_CPU_MMXEXT |
+                                X264_CPU_3DNOW | X264_CPU_3DNOWEXT;
+    if( ( cpu & mmx_family ) != 0 ) x264_emms();
+}
+
+
+#if 0
+/*
+ * XXX: adapted from libmpeg2 */
+#if 0
+#define cpuid(op,eax,ebx,ecx,edx)   \
+    __asm__ ("push %%ebx\n\t"       \
+             "cpuid\n\t"            \
+             "movl %%ebx,%1\n\t"    \
+             "pop %%ebx"        \
+             : "=a" (eax),      \
+               "=r" (ebx),      \
+               "=c" (ecx),      \
+               "=d" (edx)       \
+             : "a" (op)         \
+             : "cc")
+#endif
+
+uint32_t x264_cpu_detect( void )
+{
+    uint32_t cpu = 0;
+
+    uint32_t eax, ebx, ecx, edx;
+    int      b_amd;
+
+    /* NOTE: enclosed in "#if 0" (and the cpuid() macro above is disabled too), so this copy is never compiled. */
+    /* Test if cpuid is supported */
+    asm volatile(
+        "pushf\n"
+        "pushf\n"
+        "pop %0\n"
+        "movl %0,%1\n"
+        "xorl $0x200000,%0\n"
+        "push %0\n"
+        "popf\n"
+        "pushf\n"
+        "pop %0\n"
+        "popf\n"
+         : "=r" (eax), "=r" (ebx) : : "cc");
+
+    if( eax == ebx )
+    {
+        /* No cpuid */
+        return 0;
+    }
+
+    cpuid( 0, eax, ebx, ecx, edx);
+    if( eax == 0 )
+    {
+        return 0;
+    }
+    b_amd   = (ebx == 0x68747541) && (ecx == 0x444d4163) && (edx == 0x69746e65);
+
+    cpuid( 1, eax, ebx, ecx, edx );
+    if( (edx&0x00800000) == 0 )
+    {
+        /* No MMX */
+        return 0;
+    }
+    cpu = X264_CPU_MMX;
+    if( (edx&0x02000000) )
+    {
+        /* SSE - identical to AMD MMX extensions */
+        cpu |= X264_CPU_MMXEXT|X264_CPU_SSE;
+    }
+    if( (edx&0x04000000) )
+    {
+        /* Is it OK ? */
+        cpu |= X264_CPU_SSE2;
+    }
+
+    cpuid( 0x80000000, eax, ebx, ecx, edx );
+    if( eax < 0x80000001 )
+    {
+        /* no extended capabilities */
+        return cpu;
+    }
+
+    cpuid( 0x80000001, eax, ebx, ecx, edx );
+    if( edx&0x80000000 )
+    {
+        cpu |= X264_CPU_3DNOW;
+    }
+    if( b_amd && (edx&0x00400000) )
+    {
+        /* AMD MMX extensions */
+        cpu |= X264_CPU_MMXEXT;
+    }
+
+    return cpu;
+}
+#endif
+
+#elif defined( HAVE_ALTIVEC )
+#include <sys/sysctl.h>
+
+uint32_t x264_cpu_detect( void )
+{
+    /* Thx VLC */
+    /* Query sysctl { CTL_HW, HW_VECTORUNIT }; a non-zero answer sets X264_CPU_ALTIVEC. */
+    int      mib[2] = { CTL_HW, HW_VECTORUNIT };
+    int      vector_unit = 0;
+    size_t   len = sizeof( vector_unit );
+
+    /* A failed query is reported the same way as "no AltiVec". */
+    if( sysctl( mib, 2, &vector_unit, &len, NULL, 0 ) != 0 )
+        return 0;
+    if( vector_unit == 0 )
+        return 0;
+
+    return X264_CPU_ALTIVEC;
+}
+
+void     x264_cpu_restore( uint32_t cpu )
+{   /* intentionally empty in the HAVE_ALTIVEC build; cpu is ignored */
+}
+
+#else
+
+uint32_t x264_cpu_detect( void )
+{   /* fallback when neither ARCH_X86 nor HAVE_ALTIVEC is defined: no capability flags */
+    return 0;
+}
+
+void     x264_cpu_restore( uint32_t cpu )
+{   /* fallback build: nothing to do, cpu is ignored */
+}
+
+#endif
+
diff --git a/core/cpu.h b/core/cpu.h
new file mode 100644
index 00000000..a9df3f83
--- /dev/null
+++ b/core/cpu.h
@@ -0,0 +1,32 @@
+/*****************************************************************************
+ * cpu.h: h264 encoder library
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: cpu.h,v 1.1 2004/06/03 19:27:06 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#ifndef _CPU_H
+#define _CPU_H 1
+
+uint32_t x264_cpu_detect( void );   /* returns a mask of X264_CPU_* flags, 0 when nothing is detected */
+
+/* probably MMX(EXT) centric but .... */
+void     x264_cpu_restore( uint32_t cpu );   /* cpu: mask of X264_CPU_* flags */
+
+#endif
diff --git a/core/csp.c b/core/csp.c
new file mode 100644
index 00000000..1dda6b60
--- /dev/null
+++ b/core/csp.c
@@ -0,0 +1,379 @@
+/*****************************************************************************
+ * csp.c: h264 encoder library
+ *****************************************************************************
+ * Copyright (C) 2004 Laurent Aimar
+ * $Id: csp.c,v 1.1 2004/06/03 19:27:06 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <string.h>
+
+#include "common.h"
+
+static inline void plane_copy( uint8_t *dst, int i_dst,
+                               uint8_t *src, int i_src, int w, int h)
+{
+    /* Copy h rows of w bytes; each buffer advances by its own stride. */
+    int row;
+    for( row = 0; row < h; row++, dst += i_dst, src += i_src )
+    {
+        memcpy( dst, src, w );
+    }
+}
+static inline void plane_copy_vflip( uint8_t *dst, int i_dst, uint8_t *src, int i_src, int w, int h )
+{
+    uint8_t *last_row = src + (h - 1) * i_src;          /* bottom row of the source */
+    plane_copy( dst, i_dst, last_row, -i_src, w, h );   /* negated stride walks upwards */
+}
+
+static inline void plane_subsamplev2( uint8_t *dst, int i_dst,
+                                      uint8_t *src, int i_src, int w, int h)
+{
+    /* Halve the height: each output row is the rounded average of a
+     * source row and the row i_src bytes away from it. */
+    int row, col;
+
+    for( row = 0; row < h; row++ )
+    {
+        for( col = 0; col < w; col++ )
+        {
+            dst[col] = ( src[col] + src[col + i_src] + 1 ) >> 1;
+        }
+        dst += i_dst;
+        src += 2 * i_src;
+    }
+}
+
+static inline void plane_subsamplev2_vlip( uint8_t *dst, int i_dst, uint8_t *src, int i_src, int w, int h )
+{
+    uint8_t *last_row = src + (2*h-1)*i_src;                   /* 2*h source rows are consumed; start at the last */
+    plane_subsamplev2( dst, i_dst, last_row, -i_src, w, h );   /* negated stride walks upwards */
+}
+
+static inline void plane_subsamplehv2( uint8_t *dst, int i_dst,
+                                       uint8_t *src, int i_src, int w, int h)
+{
+    /* Halve both dimensions: every output byte is the average of a 2x2
+     * source square.  The +2 bias rounds to nearest (halves go up), in
+     * line with the (a + b + 1) >> 1 rounding of plane_subsamplev2. */
+    for( ; h > 0; h-- )
+    {
+        uint8_t *d = dst;
+        uint8_t *s = src;
+        int     i;
+        for( i = 0; i < w; i++, s += 2 )
+            *d++ = ( s[0] + s[1] + s[i_src] + s[i_src+1] + 2 ) >> 2;
+        dst += i_dst;
+        src += 2 * i_src;
+    }
+}
+
+static inline void plane_subsamplehv2_vlip( uint8_t *dst, int i_dst, uint8_t *src, int i_src, int w, int h )
+{
+    uint8_t *last_row = src + (2*h-1)*i_src;                    /* 2*h source rows are consumed; start at the last */
+    plane_subsamplehv2( dst, i_dst, last_row, -i_src, w, h );   /* negated stride walks upwards */
+}
+
+static void i420_to_i420( x264_frame_t *frm, x264_image_t *img,
+                          int i_width, int i_height )
+{
+    /* Plane-by-plane copy from img to frm.  Plane 0 is copied at
+     * i_width x i_height, planes 1 and 2 at half of each dimension.
+     * With X264_CSP_VFLIP set in img->i_csp the rows are copied
+     * bottom-up. */
+    int p;
+
+    for( p = 0; p < 3; p++ )
+    {
+        const int w = ( p == 0 ) ? i_width  : i_width / 2;
+        const int h = ( p == 0 ) ? i_height : i_height / 2;
+
+        if( img->i_csp & X264_CSP_VFLIP )
+        {
+            plane_copy_vflip( frm->plane[p], frm->i_stride[p],
+                              img->plane[p], img->i_stride[p],
+                              w, h );
+        }
+        else
+        {
+            plane_copy( frm->plane[p], frm->i_stride[p],
+                        img->plane[p], img->i_stride[p],
+                        w, h );
+        }
+    }
+}
+
+static void yv12_to_i420( x264_frame_t *frm, x264_image_t *img,
+                          int i_width, int i_height )
+{
+    /* Same copy as for I420 input, except that image plane 1 lands in
+     * frame plane 2 and image plane 2 in frame plane 1. */
+    static const int dst_of[3] = { 0, 2, 1 };   /* frame plane for each image plane */
+    int s;
+
+    for( s = 0; s < 3; s++ )
+    {
+        const int d = dst_of[s];
+        const int w = ( s == 0 ) ? i_width  : i_width / 2;
+        const int h = ( s == 0 ) ? i_height : i_height / 2;
+
+        if( img->i_csp & X264_CSP_VFLIP )
+        {
+            plane_copy_vflip( frm->plane[d], frm->i_stride[d],
+                              img->plane[s], img->i_stride[s],
+                              w, h );
+        }
+        else
+        {
+            plane_copy( frm->plane[d], frm->i_stride[d],
+                        img->plane[s], img->i_stride[s],
+                        w, h );
+        }
+    }
+}
+
+static void i422_to_i420( x264_frame_t *frm, x264_image_t *img,
+                          int i_width, int i_height )
+{
+    /* Plane 0 is copied as is; planes 1 and 2 are halved vertically
+     * into i_width/2 x i_height/2.  X264_CSP_VFLIP flips all three. */
+    int p;
+
+    if( img->i_csp & X264_CSP_VFLIP )
+    {
+        plane_copy_vflip( frm->plane[0], frm->i_stride[0],
+                          img->plane[0], img->i_stride[0],
+                          i_width, i_height );
+    }
+    else
+    {
+        plane_copy( frm->plane[0], frm->i_stride[0],
+                    img->plane[0], img->i_stride[0],
+                    i_width, i_height );
+    }
+
+    for( p = 1; p < 3; p++ )
+    {
+        if( img->i_csp & X264_CSP_VFLIP )
+            plane_subsamplev2_vlip( frm->plane[p], frm->i_stride[p], img->plane[p],
+                                    img->i_stride[p], i_width / 2, i_height / 2 );
+        else
+            plane_subsamplev2( frm->plane[p], frm->i_stride[p], img->plane[p],
+                               img->i_stride[p], i_width / 2, i_height / 2 );
+    }
+}
+
+static void i444_to_i420( x264_frame_t *frm, x264_image_t *img,
+                          int i_width, int i_height )
+{
+    /* Plane 0 is copied as is; planes 1 and 2 are halved in both
+     * directions into i_width/2 x i_height/2.  X264_CSP_VFLIP flips all. */
+    int p;
+
+    if( img->i_csp & X264_CSP_VFLIP )
+    {
+        plane_copy_vflip( frm->plane[0], frm->i_stride[0],
+                          img->plane[0], img->i_stride[0],
+                          i_width, i_height );
+    }
+    else
+    {
+        plane_copy( frm->plane[0], frm->i_stride[0],
+                    img->plane[0], img->i_stride[0],
+                    i_width, i_height );
+    }
+
+    for( p = 1; p < 3; p++ )
+    {
+        if( img->i_csp & X264_CSP_VFLIP )
+            plane_subsamplehv2_vlip( frm->plane[p], frm->i_stride[p], img->plane[p],
+                                     img->i_stride[p], i_width / 2, i_height / 2 );
+        else
+            plane_subsamplehv2( frm->plane[p], frm->i_stride[p], img->plane[p],
+                                img->i_stride[p], i_width / 2, i_height / 2 );
+    }
+}
+static void yuyv_to_i420( x264_frame_t *frm, x264_image_t *img,
+                          int i_width, int i_height )
+{
+    /* Packed input in img->plane[0], read as Y0 U Y1 V per four bytes,
+     * is split into the three planes of frm.  Rows are handled in
+     * pairs: the first row of a pair yields luma plus the chroma row
+     * (each chroma byte averaged with the row i_src away), the second
+     * row yields luma only. */
+    uint8_t *src_row = img->plane[0];
+    int     i_src    = img->i_stride[0];
+    uint8_t *y_row   = frm->plane[0];
+    uint8_t *u_row   = frm->plane[1];
+    uint8_t *v_row   = frm->plane[2];
+    int     rows_left;
+
+    if( img->i_csp & X264_CSP_VFLIP )
+    {
+        /* begin at the last row and step backwards */
+        src_row += ( i_height - 1 ) * i_src;
+        i_src = -i_src;
+    }
+
+    for( rows_left = i_height; rows_left > 0; rows_left -= 2 )
+    {
+        int k;
+
+        /* first row of the pair: luma and averaged chroma */
+        for( k = 0; 2 * k < i_width; k++ )
+        {
+            const uint8_t *px = src_row + 4 * k;
+
+            y_row[2*k]   = px[0];
+            y_row[2*k+1] = px[2];
+            u_row[k]     = ( px[1] + px[1+i_src] + 1 ) >> 1;
+            v_row[k]     = ( px[3] + px[3+i_src] + 1 ) >> 1;
+        }
+        src_row += i_src;
+        y_row   += frm->i_stride[0];
+        u_row   += frm->i_stride[1];
+        v_row   += frm->i_stride[2];
+
+        /* second row of the pair: luma only */
+        for( k = 0; 2 * k < i_width; k++ )
+        {
+            y_row[2*k]   = src_row[4*k];
+            y_row[2*k+1] = src_row[4*k+2];
+        }
+        src_row += i_src;
+        y_row   += frm->i_stride[0];
+    }
+}
+
+/* Same values as in XviD */
+#define BITS 8 /* number of fractional bits used by FIX() */
+#define FIX(f) ((int)((f) * (1 << BITS) + 0.5)) /* coefficient scaled by 2^BITS, rounded to int */
+/* Y weights for R, G, B and the offset added after the shift */
+#define Y_R   FIX(0.257)
+#define Y_G   FIX(0.504)
+#define Y_B   FIX(0.098)
+#define Y_ADD 16
+/* U weights (R and G are subtracted in RGB_TO_I420) and offset */
+#define U_R   FIX(0.148)
+#define U_G   FIX(0.291)
+#define U_B   FIX(0.439)
+#define U_ADD 128
+/* V weights (G and B are subtracted in RGB_TO_I420) and offset */
+#define V_R   FIX(0.439)
+#define V_G   FIX(0.368)
+#define V_B   FIX(0.071)
+#define V_ADD 128
+#define RGB_TO_I420( name, POS_R, POS_G, POS_B, S_RGB ) \
+static void name( x264_frame_t *frm, x264_image_t *img, \
+                  int i_width, int i_height )           \
+{   /* POS_x: byte offset of channel x in a pixel, S_RGB: bytes per pixel */ \
+    uint8_t *src = img->plane[0];                       \
+    int     i_src= img->i_stride[0];                    \
+    int     i_y  = frm->i_stride[0];                    \
+    uint8_t *y   = frm->plane[0];                       \
+    uint8_t *u   = frm->plane[1];                       \
+    uint8_t *v   = frm->plane[2];                       \
+    /* with X264_CSP_VFLIP start at the last row and use a negated stride */ \
+    if( img->i_csp & X264_CSP_VFLIP )                   \
+    {                                                   \
+        src += ( i_height - 1 ) * i_src;                \
+        i_src = -i_src;                                 \
+    }                                                   \
+    /* each pass handles two rows; each inner step a 2x2 pixel square */ \
+    for(  ; i_height > 0; i_height -= 2 )               \
+    {                                                   \
+        uint8_t *ss = src;                              \
+        uint8_t *yy = y;                                \
+        uint8_t *uu = u;                                \
+        uint8_t *vv = v;                                \
+        int w;                                          \
+                                                        \
+        for( w = i_width; w > 0; w -= 2 )               \
+        {                                               \
+            int cr = 0,cg = 0,cb = 0;                   \
+            int r, g, b;                                \
+            /* cr/cg/cb sum the channels of the four pixels for the chroma */ \
+            /* Luma */                                  \
+            cr = r = ss[POS_R];                         \
+            cg = g = ss[POS_G];                         \
+            cb = b = ss[POS_B];                         \
+                                                        \
+            yy[0] = Y_ADD + ((Y_R * r + Y_G * g + Y_B * b) >> BITS);    \
+                                                        \
+            cr+= r = ss[POS_R+i_src];                   \
+            cg+= g = ss[POS_G+i_src];                   \
+            cb+= b = ss[POS_B+i_src];                   \
+            yy[i_y] = Y_ADD + ((Y_R * r + Y_G * g + Y_B * b) >> BITS);  \
+            yy++;                                       \
+            ss += S_RGB;                                \
+                                                        \
+            cr+= r = ss[POS_R];                         \
+            cg+= g = ss[POS_G];                         \
+            cb+= b = ss[POS_B];                         \
+                                                        \
+            yy[0] = Y_ADD + ((Y_R * r + Y_G * g + Y_B * b) >> BITS);    \
+                                                        \
+            cr+= r = ss[POS_R+i_src];                   \
+            cg+= g = ss[POS_G+i_src];                   \
+            cb+= b = ss[POS_B+i_src];                   \
+            yy[i_y] = Y_ADD + ((Y_R * r + Y_G * g + Y_B * b) >> BITS);  \
+            yy++;                                       \
+            ss += S_RGB;                                \
+            /* the extra 2 in the shift divides the four-pixel sums by 4 */ \
+            /* Chroma */                                \
+            *uu++ = (uint8_t)(U_ADD + ((-U_R * cr - U_G * cg + U_B * cb) >> (BITS+2)) ); \
+            *vv++ = (uint8_t)(V_ADD + (( V_R * cr - V_G * cg - V_B * cb) >> (BITS+2)) ); \
+        }                                               \
+                                                        \
+        src += 2*i_src;                                   \
+        y += 2*frm->i_stride[0];                        \
+        u += frm->i_stride[1];                          \
+        v += frm->i_stride[2];                          \
+    }                                                   \
+}
+
+RGB_TO_I420( rgb_to_i420,  0, 1, 2, 3 )   /* R,G,B at bytes 0,1,2 of 3; no ';' since the macro is a whole function */
+RGB_TO_I420( bgr_to_i420,  2, 1, 0, 3 )   /* B,G,R at bytes 0,1,2 of 3 */
+RGB_TO_I420( bgra_to_i420, 2, 1, 0, 4 )   /* B,G,R at bytes 0,1,2 of 4; byte 3 is not read */
+
+void x264_csp_init( int cpu, int i_csp, x264_csp_function_t *pf )
+{
+    /* Fill *pf with the converters that produce i_csp frames.  Only
+     * X264_CSP_I420 is handled; any other value prints a message and
+     * terminates the process.  cpu is not used. */
+    if( i_csp != X264_CSP_I420 )
+    {
+        /* For now, can't happen */
+        fprintf( stderr, "arg in x264_csp_init\n" );
+        exit( -1 );
+    }
+
+    /* one entry per accepted input layout */
+    pf->i420 = i420_to_i420;
+    pf->i422 = i422_to_i420;
+    pf->i444 = i444_to_i420;
+    pf->yv12 = yv12_to_i420;
+    pf->yuyv = yuyv_to_i420;
+    pf->rgb  = rgb_to_i420;
+    pf->bgr  = bgr_to_i420;
+    pf->bgra = bgra_to_i420;
+}
+
diff --git a/core/csp.h b/core/csp.h
new file mode 100644
index 00000000..1b02795a
--- /dev/null
+++ b/core/csp.h
@@ -0,0 +1,43 @@
+/*****************************************************************************
+ * csp.h: h264 encoder library
+ *****************************************************************************
+ * Copyright (C) 2004 Laurent Aimar
+ * $Id: csp.h,v 1.1 2004/06/03 19:27:06 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#ifndef _CSP_H
+#define _CSP_H 1
+
+typedef struct /* table of image-to-frame converters, one entry per input colorspace */
+{
+    void (*i420)( x264_frame_t *, x264_image_t *, int i_width, int i_height );
+    void (*i422)( x264_frame_t *, x264_image_t *, int i_width, int i_height );
+    void (*i444)( x264_frame_t *, x264_image_t *, int i_width, int i_height );
+    void (*yv12)( x264_frame_t *, x264_image_t *, int i_width, int i_height );
+    void (*yuyv)( x264_frame_t *, x264_image_t *, int i_width, int i_height );
+    void (*rgb )( x264_frame_t *, x264_image_t *, int i_width, int i_height );
+    void (*bgr )( x264_frame_t *, x264_image_t *, int i_width, int i_height );
+    void (*bgra)( x264_frame_t *, x264_image_t *, int i_width, int i_height );
+} x264_csp_function_t;
+
+
+void x264_csp_init( int cpu, int i_csp, x264_csp_function_t *pf );   /* fills *pf with the converters for frame colorspace i_csp */
+
+#endif
+
diff --git a/core/dct.c b/core/dct.c
new file mode 100644
index 00000000..65aab0cf
--- /dev/null
+++ b/core/dct.c
@@ -0,0 +1,288 @@
+/*****************************************************************************
+ * dct.c: h264 encoder library
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: dct.c,v 1.1 2004/06/03 19:27:06 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#include <stdlib.h>
+#include <stdint.h>
+
+#include "x264.h"
+
+#include "dct.h"
+#ifdef HAVE_MMXEXT
+#   include "i386/dct.h"
+#endif
+
+
+static inline int clip_uint8( int a )   /* clamp a to the 0..255 range */
+{
+    /* Anything outside 0..255 has a bit set above bit 7. */
+    if( a & ~255 )
+        return a < 0 ? 0 : 255;   /* real bound, not -1 that only works after uint8_t truncation */
+    return a;
+}
+
+/*
+ * XXX For all dct dc : input could be equal to output so ...
+ */
+
+static void dct2x2dc( int16_t d[2][2] )
+{
+    /* In-place 2x2 sum/difference transform.  The four inputs are read
+     * into locals first so that writing d cannot disturb later reads. */
+    const int a = d[0][0];
+    const int b = d[0][1];
+    const int c = d[1][0];
+    const int e = d[1][1];
+
+    d[0][0] = ( a + b ) + ( c + e );
+    d[0][1] = ( a - b ) + ( c - e );
+    d[1][0] = ( a + b ) - ( c + e );
+    d[1][1] = ( a - b ) - ( c - e );
+}
+
+static void dct4x4dc( int16_t d[4][4] )
+{
+    int16_t t[4][4];   /* first-pass result, stored transposed */
+    int     k;
+    /* Pass 1: butterfly each row of d into a column of t. */
+    for( k = 0; k < 4; k++ )
+    {
+        const int sum_lo  = d[k][0] + d[k][1];
+        const int diff_lo = d[k][0] - d[k][1];
+        const int sum_hi  = d[k][2] + d[k][3];
+        const int diff_hi = d[k][2] - d[k][3];
+
+        t[0][k] = sum_lo  + sum_hi;
+        t[1][k] = sum_lo  - sum_hi;
+        t[2][k] = diff_lo - diff_hi;
+        t[3][k] = diff_lo + diff_hi;
+    }
+
+    /* Pass 2: same butterfly on each row of t, written back into a
+     * column of d as (x + 1) >> 1. */
+    for( k = 0; k < 4; k++ )
+    {
+        const int sum_lo  = t[k][0] + t[k][1];
+        const int diff_lo = t[k][0] - t[k][1];
+        const int sum_hi  = t[k][2] + t[k][3];
+        const int diff_hi = t[k][2] - t[k][3];
+
+        d[0][k] = ( sum_lo  + sum_hi  + 1 ) >> 1;
+        d[1][k] = ( sum_lo  - sum_hi  + 1 ) >> 1;
+        d[2][k] = ( diff_lo - diff_hi + 1 ) >> 1;
+        d[3][k] = ( diff_lo + diff_hi + 1 ) >> 1;
+    }
+}
+
+static void idct4x4dc( int16_t d[4][4] )
+{
+    int16_t t[4][4];   /* first-pass result */
+    int     k;
+    /* Pass 1: butterfly down each column of d into the same column of t. */
+    for( k = 0; k < 4; k++ )
+    {
+        const int sum_lo  = d[0][k] + d[1][k];
+        const int diff_lo = d[0][k] - d[1][k];
+        const int sum_hi  = d[2][k] + d[3][k];
+        const int diff_hi = d[2][k] - d[3][k];
+
+        t[0][k] = sum_lo  + sum_hi;
+        t[1][k] = sum_lo  - sum_hi;
+        t[2][k] = diff_lo - diff_hi;
+        t[3][k] = diff_lo + diff_hi;
+    }
+
+    /* Pass 2: butterfly along each row of t, written to the same row
+     * of d without any scaling. */
+    for( k = 0; k < 4; k++ )
+    {
+        const int sum_lo  = t[k][0] + t[k][1];
+        const int diff_lo = t[k][0] - t[k][1];
+        const int sum_hi  = t[k][2] + t[k][3];
+        const int diff_hi = t[k][2] - t[k][3];
+
+        d[k][0] = sum_lo  + sum_hi;
+        d[k][1] = sum_lo  - sum_hi;
+        d[k][2] = diff_lo - diff_hi;
+        d[k][3] = diff_lo + diff_hi;
+    }
+}
+
+static void sub4x4_dct( int16_t dct[4][4], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
+{
+    int16_t diff[4][4];   /* pix1 - pix2, row major */
+    int16_t t[4][4];      /* first pass output, transposed */
+    int     r, c, k;
+
+    /* Residual: pixel-wise difference of the two 4x4 areas. */
+    for( r = 0; r < 4; r++ )
+    {
+        const uint8_t *a = pix1 + r * i_pix1;
+        const uint8_t *b = pix2 + r * i_pix2;
+        for( c = 0; c < 4; c++ )
+        {
+            diff[r][c] = a[c] - b[c];
+        }
+    }
+
+    /* Pass 1: transform each row of diff into a column of t. */
+    for( k = 0; k < 4; k++ )
+    {
+        const int sum_outer  = diff[k][0] + diff[k][3];
+        const int sum_inner  = diff[k][1] + diff[k][2];
+        const int diff_outer = diff[k][0] - diff[k][3];
+        const int diff_inner = diff[k][1] - diff[k][2];
+        t[0][k] =     sum_outer  +     sum_inner;
+        t[1][k] = 2 * diff_outer +     diff_inner;
+        t[2][k] =     sum_outer  -     sum_inner;
+        t[3][k] =     diff_outer - 2 * diff_inner;
+    }
+
+    /* Pass 2: same transform on each row of t, into a column of dct. */
+    for( k = 0; k < 4; k++ )
+    {
+        const int sum_outer  = t[k][0] + t[k][3];
+        const int sum_inner  = t[k][1] + t[k][2];
+        const int diff_outer = t[k][0] - t[k][3];
+        const int diff_inner = t[k][1] - t[k][2];
+        dct[0][k] =     sum_outer  +     sum_inner;
+        dct[1][k] = 2 * diff_outer +     diff_inner;
+        dct[2][k] =     sum_outer  -     sum_inner;
+        dct[3][k] =     diff_outer - 2 * diff_inner;
+    }
+}
+
+static void sub8x8_dct( int16_t dct[4][4][4], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
+{
+    int n;  /* 4x4 block number: bit 0 selects the right half, bit 1 the lower half */
+    for( n = 0; n < 4; n++ )
+        sub4x4_dct( dct[n], &pix1[4*(n>>1)*i_pix1 + 4*(n&1)], i_pix1,
+                            &pix2[4*(n>>1)*i_pix2 + 4*(n&1)], i_pix2 );
+}
+
+static void sub16x16_dct( int16_t dct[16][4][4], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
+{
+    int q;  /* 8x8 quadrant number; quadrant q owns the 4x4 blocks 4*q .. 4*q+3 */
+    for( q = 0; q < 4; q++ )
+        sub8x8_dct( &dct[4*q], &pix1[8*(q>>1)*i_pix1 + 8*(q&1)], i_pix1,
+                               &pix2[8*(q>>1)*i_pix2 + 8*(q&1)], i_pix2 );
+}
+
+
+static void add4x4_idct( uint8_t *p_dst, int i_dst, int16_t dct[4][4] )
+{
+    int16_t res[4][4];    /* reconstructed residual */
+    int16_t half[4][4];   /* output of the first 1-D pass */
+    int row, col;
+    int n;
+
+    for( n = 0; n < 4; n++ )
+    {
+        const int even_sum  = dct[0][n] + dct[2][n];
+        const int even_diff = dct[0][n] - dct[2][n];
+        const int odd_sum   = dct[1][n] + (dct[3][n]>>1);
+        const int odd_diff  = (dct[1][n]>>1) - dct[3][n];
+
+        half[0][n] = even_sum  + odd_sum;
+        half[1][n] = even_diff + odd_diff;
+        half[2][n] = even_diff - odd_diff;
+        half[3][n] = even_sum  - odd_sum;
+    }
+
+    for( n = 0; n < 4; n++ )
+    {
+        const int16_t *in = half[n];
+        const int even_sum  = in[0] + in[2], even_diff = in[0] - in[2];
+        const int odd_sum   = in[1] + (in[3]>>1);
+        const int odd_diff  = (in[1]>>1) - in[3];
+        /* 32 is added before the 6-bit right shift so the result is rounded */
+        res[n][0] = ( even_sum  + odd_sum  + 32 ) >> 6;
+        res[n][1] = ( even_diff + odd_diff + 32 ) >> 6;
+        res[n][2] = ( even_diff - odd_diff + 32 ) >> 6;
+        res[n][3] = ( even_sum  - odd_sum  + 32 ) >> 6;
+    }
+
+    for( row = 0; row < 4; row++ )
+    {
+        uint8_t *out = &p_dst[row * i_dst];
+        for( col = 0; col < 4; col++ )
+        {
+            out[col] = clip_uint8( out[col] + res[row][col] );
+        }
+    }
+}
+
+static void add8x8_idct( uint8_t *p_dst, int i_dst, int16_t dct[4][4][4] )
+{
+    /* 4x4 block n covers column 4*(n&1), row 4*(n>>1) of the 8x8 area */
+    int n;
+    for( n = 0; n < 4; n++ )
+        add4x4_idct( &p_dst[4*(n>>1)*i_dst + 4*(n&1)], i_dst, dct[n] );
+}
+
+static void add16x16_idct( uint8_t *p_dst, int i_dst, int16_t dct[16][4][4] )
+{
+    /* 8x8 quadrant q covers column 8*(q&1), row 8*(q>>1) and uses blocks 4*q .. 4*q+3 */
+    int q;
+    for( q = 0; q < 4; q++ )
+        add8x8_idct( &p_dst[8*(q>>1)*i_dst + 8*(q&1)], i_dst, &dct[4*q] );
+}
+
+
+
+/****************************************************************************
+ * x264_dct_init:
+ ****************************************************************************/
+void x264_dct_init( int cpu, x264_dct_function_t *dctf )
+{
+    /* the portable C implementations are always installed first */
+    dctf->sub4x4_dct    = sub4x4_dct;
+    dctf->sub8x8_dct    = sub8x8_dct;
+    dctf->sub16x16_dct  = sub16x16_dct;
+    dctf->dct4x4dc      = dct4x4dc;
+    dctf->dct2x2dc      = dct2x2dc;
+
+    dctf->add4x4_idct   = add4x4_idct;
+    dctf->add8x8_idct   = add8x8_idct;
+    dctf->add16x16_idct = add16x16_idct;
+    dctf->idct4x4dc     = idct4x4dc;
+    /* both 2x2 DC entries point at the same routine */
+    dctf->idct2x2dc     = dct2x2dc;
+
+    /* cpu is only consulted when the MMXEXT code is compiled in */
+#ifdef HAVE_MMXEXT
+    if( ( cpu & X264_CPU_MMXEXT ) != 0 )
+    {
+        /* override every entry except the two 2x2 DC transforms */
+        dctf->sub4x4_dct    = x264_sub4x4_dct_mmxext;
+        dctf->add4x4_idct   = x264_add4x4_idct_mmxext;
+        dctf->sub8x8_dct    = x264_sub8x8_dct_mmxext;
+        dctf->add8x8_idct   = x264_add8x8_idct_mmxext;
+        dctf->sub16x16_dct  = x264_sub16x16_dct_mmxext;
+        dctf->add16x16_idct = x264_add16x16_idct_mmxext;
+
+        dctf->dct4x4dc  = x264_dct4x4dc_mmxext;
+        dctf->idct4x4dc = x264_idct4x4dc_mmxext;
+    }
+#endif
+}
+
diff --git a/core/dct.h b/core/dct.h
new file mode 100644
index 00000000..bedbbf43
--- /dev/null
+++ b/core/dct.h
@@ -0,0 +1,49 @@
+/*****************************************************************************
+ * dct.h: h264 encoder library
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: dct.h,v 1.1 2004/06/03 19:27:06 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#ifndef _DCT_H
+#define _DCT_H 1
+
+typedef struct
+{
+    void (*sub4x4_dct)   ( int16_t dct[4][4],  uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 );
+    void (*add4x4_idct)  ( uint8_t *p_dst, int i_dst, int16_t dct[4][4] );
+    /* sub*_dct: transform of the difference pix1 - pix2; add*_idct: add the inverse transform onto p_dst */
+    void (*sub8x8_dct)   ( int16_t dct[4][4][4],  uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 );
+    void (*add8x8_idct)  ( uint8_t *p_dst, int i_dst, int16_t dct[4][4][4] );
+    /* the 8x8 variants take four 4x4 coefficient blocks, the 16x16 variants sixteen */
+    void (*sub16x16_dct)   ( int16_t dct[16][4][4],  uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 );
+    void (*add16x16_idct)  ( uint8_t *p_dst, int i_dst, int16_t dct[16][4][4] );
+
+    /* transforms taking a single 4x4 or 2x2 array; the C idct4x4dc overwrites its argument with the result */
+    void (*dct4x4dc) ( int16_t d[4][4] );
+    void (*idct4x4dc)( int16_t d[4][4] );
+
+    void (*dct2x2dc) ( int16_t d[2][2] );
+    void (*idct2x2dc)( int16_t d[2][2] );
+    /* every entry is assigned by x264_dct_init() */
+} x264_dct_function_t;
+
+void x264_dct_init( int cpu, x264_dct_function_t *dctf );
+
+#endif
diff --git a/core/frame.c b/core/frame.c
new file mode 100644
index 00000000..2e926176
--- /dev/null
+++ b/core/frame.c
@@ -0,0 +1,701 @@
+/*****************************************************************************
+ * frame.c: h264 encoder library
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: frame.c,v 1.1 2004/06/03 19:27:06 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdint.h>
+
+#include "common.h"
+#include "macroblock.h"
+
+x264_frame_t *x264_frame_new( x264_t *h )
+{
+    x264_frame_t   *frame = x264_malloc( sizeof( x264_frame_t ) );
+    int i;
+    /* Dimensions rounded up to a multiple of 16; the stride carries 64 extra
+     * columns (32 on each side of a luma row) for motion estimation. */
+    const int i_stride = ( ( h->param.i_width  + 15 )&0xfffff0 )+ 64;
+    const int i_lines  = ( ( h->param.i_height + 15 )&0xfffff0 );
+
+    if( frame == NULL )
+        return NULL;
+    frame->i_plane = 3;
+    for( i = 0; i < 3; i++ )
+    {
+        /* planes 1 and 2 are halved both ways for I420, in width only for I422 */
+        int i_divh = 1, i_divw = 1;
+        if( i > 0 && h->param.i_csp == X264_CSP_I420 )
+            i_divh = i_divw = 2;
+        else if( i > 0 && h->param.i_csp == X264_CSP_I422 )
+            i_divw = 2;
+        frame->i_stride[i] = i_stride / i_divw;
+        frame->i_lines[i] = i_lines / i_divh;
+        frame->buffer[i] = x264_malloc( frame->i_stride[i] * ( frame->i_lines[i] + 64 / i_divh ) );
+        if( frame->buffer[i] == NULL )
+        {
+            /* never hand out a half-built frame: release what exists, report failure */
+            while( --i >= 0 )
+                x264_free( frame->buffer[i] );
+            x264_free( frame );
+            return NULL;
+        }
+        frame->plane[i] = ((uint8_t*)frame->buffer[i]) +
+                          frame->i_stride[i] * 32 / i_divh + 32 / i_divw;
+    }
+    frame->i_stride[3] = 0;
+    frame->i_lines[3] = 0;
+    frame->buffer[3] = NULL;
+    frame->plane[3] = NULL;
+    frame->i_poc = -1;
+    frame->i_type = X264_TYPE_AUTO;
+    frame->i_qpplus1 = 0;
+    return frame;
+}
+
+void x264_frame_delete( x264_frame_t *frame )
+{
+    int n = 0;  /* index of the next plane buffer to release */
+    while( n < frame->i_plane )
+    {
+        x264_free( frame->buffer[n++] );
+    }
+    /* the buffers are gone; finally release the frame structure itself */
+    x264_free( frame );
+}
+
+void x264_frame_copy_picture( x264_t *h, x264_frame_t *dst, x264_picture_t *src )
+{
+    /* Copy an input picture into an encoder frame.  Every converter in
+     * h->csp is called with the same arguments (destination frame, source
+     * image, encoder width and height); only the routine chosen depends on
+     * the colourspace id of the source. */
+    const int i_width  = h->param.i_width;
+    const int i_height = h->param.i_height;
+    const int i_csp    = src->img.i_csp & X264_CSP_MASK;
+
+    /* per-picture properties are carried over unchanged */
+    dst->i_pts      = src->i_pts;
+    dst->i_qpplus1  = src->i_qpplus1;
+    dst->i_type     = src->i_type;
+
+    /* dispatch on the masked colourspace id */
+    if( i_csp == X264_CSP_I420 )
+        h->csp.i420( dst, &src->img, i_width, i_height );
+    else if( i_csp == X264_CSP_YV12 )
+        h->csp.yv12( dst, &src->img, i_width, i_height );
+    else if( i_csp == X264_CSP_I422 )
+        h->csp.i422( dst, &src->img, i_width, i_height );
+    else if( i_csp == X264_CSP_I444 )
+        h->csp.i444( dst, &src->img, i_width, i_height );
+    else if( i_csp == X264_CSP_YUYV )
+        h->csp.yuyv( dst, &src->img, i_width, i_height );
+    else if( i_csp == X264_CSP_RGB )
+        h->csp.rgb( dst, &src->img, i_width, i_height );
+    else if( i_csp == X264_CSP_BGR )
+        h->csp.bgr( dst, &src->img, i_width, i_height );
+    else if( i_csp == X264_CSP_BGRA )
+        h->csp.bgra( dst, &src->img, i_width, i_height );
+    else
+    {
+        /* unknown colourspace: report it; no converter is called */
+        fprintf( stderr, "Arg invalid CSP\n" );
+    }
+}
+
+
+
+void x264_frame_expand_border( x264_frame_t *frame )
+{
+    int i, y;
+    for( i = 0; i < frame->i_plane; i++ )
+    {
+        /* pad: border thickness in samples; cols: row length without the two side borders */
+        const int pad    = ( i == 0 ) ? 32 : 16;
+        const int stride = frame->i_stride[i];
+        const int lines  = frame->i_lines[i];
+        const int cols   = stride - 2 * pad;
+        uint8_t *const first = frame->plane[i];
+        uint8_t *const last  = first + ( lines - 1 ) * stride;
+
+        for( y = 0; y < pad; y++ )
+        {
+            uint8_t *const above = first - ( y + 1 ) * stride;
+            uint8_t *const below = first + ( lines + y ) * stride;
+
+            /* row above the picture: copy of the first row plus both corners */
+            memcpy( above, first, cols );
+            memset( above - pad, first[0], pad );
+            memset( above + cols, first[cols - 1], pad );
+            /* row below the picture, built the same way from the last row */
+            memcpy( below, last, cols );
+            memset( below - pad, last[0], pad );
+            memset( below + cols, last[cols - 1], pad );
+        }
+        for( y = 0; y < lines; y++ )
+        {
+            uint8_t *const row = first + y * stride;
+            /* replicate the first and the last sample of the row sideways */
+            memset( row - pad, row[0], pad );
+            memset( row + cols, row[cols - 1], pad );
+        }
+    }
+}
+
+/* FIXME: these tables are duplicated from macroblock.c. block_idx_xy[x][y] gives the non_zero_count slot of the 4x4 block at column x, row y. */
+static const uint8_t block_idx_xy[4][4] =
+{
+    { 0, 2, 8,  10},
+    { 1, 3, 9,  11},
+    { 4, 6, 12, 14},
+    { 5, 7, 13, 15}
+};
+static const int i_chroma_qp_table[52] =   /* indexed by macroblock QP + chroma_qp_index_offset, clipped to 0..51 */
+{
+     0,  1,  2,  3,  4,  5,  6,  7,  8,  9,
+    10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+    20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+    29, 30, 31, 32, 32, 33, 34, 34, 35, 35,
+    36, 36, 37, 37, 37, 38, 38, 38, 39, 39,
+    39, 39
+};
+
+/* Deblocking filter thresholds (p153): alpha and tc0 are indexed by QP + alpha_c0_offset, beta by QP + beta_offset, each clipped to 0..51. */
+static const int i_alpha_table[52] =
+{
+     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+     0,  0,  0,  0,  0,  0,  4,  4,  5,  6,
+     7,  8,  9, 10, 12, 13, 15, 17, 20, 22,
+    25, 28, 32, 36, 40, 45, 50, 56, 63, 71,
+    80, 90,101,113,127,144,162,182,203,226,
+    255, 255
+};
+static const int i_beta_table[52] =
+{
+     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+     0,  0,  0,  0,  0,  0,  2,  2,  2,  3,
+     3,  3,  3,  4,  4,  4,  6,  6,  7,  7,
+     8,  8,  9,  9, 10, 10, 11, 11, 12, 12,
+    13, 13, 14, 14, 15, 15, 16, 16, 17, 17,
+    18, 18
+};
+static const int i_tc0_table[52][3] =   /* second index is bS - 1, for bS in 1..3 */
+{
+    { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 },
+    { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 },
+    { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 1 },
+    { 0, 0, 1 }, { 0, 0, 1 }, { 0, 0, 1 }, { 0, 1, 1 }, { 0, 1, 1 }, { 1, 1, 1 },
+    { 1, 1, 1 }, { 1, 1, 1 }, { 1, 1, 1 }, { 1, 1, 2 }, { 1, 1, 2 }, { 1, 1, 2 },
+    { 1, 1, 2 }, { 1, 2, 3 }, { 1, 2, 3 }, { 2, 2, 3 }, { 2, 2, 4 }, { 2, 3, 4 },
+    { 2, 3, 4 }, { 3, 3, 5 }, { 3, 4, 6 }, { 3, 4, 6 }, { 4, 5, 7 }, { 4, 5, 8 },
+    { 4, 6, 9 }, { 5, 7,10 }, { 6, 8,11 }, { 6, 8,13 }, { 7,10,14 }, { 8,11,16 },
+    { 9,12,18 }, {10,13,20 }, {11,15,23 }, {13,17,25 }
+};
+
+/* Clamp a to the 8-bit sample range [0,255] (originally from ffmpeg). */
+static inline int clip_uint8( int a )
+{
+    /* explicit bounds: the old (-a)>>31 form returned -1, not 255, as an int for a > 255 */
+    if( a&(~255) )
+        return a < 0 ? 0 : 255;
+    return a;
+}
+
+static inline void deblocking_filter_edgev( x264_t *h, uint8_t *pix, int i_pix_stride, int bS[4], int i_QP )
+{
+    int i, d;
+    const int i_index_a = x264_clip3( i_QP + h->sh.i_alpha_c0_offset, 0, 51 );
+    const int alpha = i_alpha_table[i_index_a];
+    const int beta  = i_beta_table[x264_clip3( i_QP + h->sh.i_beta_offset, 0, 51 )];
+    /* Filters across a vertical edge: pix[0] is the first sample right of it (q0), pix[-1] the last one left of it (p0). */
+    for( i = 0; i < 4; i++ )
+    {
+        if( bS[i] == 0 )
+        {
+            pix += 4 * i_pix_stride;    /* nothing to filter: skip the 4 rows this bS entry covers */
+            continue;
+        }
+
+        if( bS[i] < 4 )
+        {
+            const int tc0 = i_tc0_table[i_index_a][bS[i] - 1];
+
+            /* each bS entry covers 4 rows of the edge */
+            for( d = 0; d < 4; d++ )
+            {
+                const int p0 = pix[-1];
+                const int p1 = pix[-2];
+                const int p2 = pix[-3];
+                const int q0 = pix[0];
+                const int q1 = pix[1];
+                const int q2 = pix[2];
+                /* all samples are read before any is written, so every formula below sees unfiltered values */
+                if( abs( p0 - q0 ) < alpha &&
+                    abs( p1 - p0 ) < beta &&
+                    abs( q1 - q0 ) < beta )
+                {
+                    int tc = tc0;
+                    int i_delta;
+                    /* each side that is smooth enough also gets p1/q1 corrected and widens the clip range by 1 */
+                    if( abs( p2 - p0 ) < beta )
+                    {
+                        pix[-2] = p1 + x264_clip3( ( p2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( p1 << 1 ) ) >> 1, -tc0, tc0 );
+                        tc++;
+                    }
+                    if( abs( q2 - q0 ) < beta )
+                    {
+                        pix[1] = q1 + x264_clip3( ( q2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( q1 << 1 ) ) >> 1, -tc0, tc0 );
+                        tc++;
+                    }
+
+                    i_delta = x264_clip3( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
+                    pix[-1] = clip_uint8( p0 + i_delta );    /* p0' */
+                    pix[0]  = clip_uint8( q0 - i_delta );    /* q0' */
+                }
+                pix += i_pix_stride;
+            }
+        }
+        else
+        {
+            /* bS >= 4: unclipped smoothing, again 4 rows per bS entry */
+            for( d = 0; d < 4; d++ )
+            {
+                const int p0 = pix[-1];
+                const int p1 = pix[-2];
+                const int p2 = pix[-3];
+
+                const int q0 = pix[0];
+                const int q1 = pix[1];
+                const int q2 = pix[2];
+
+                if( abs( p0 - q0 ) < alpha &&
+                    abs( p1 - p0 ) < beta &&
+                    abs( q1 - q0 ) < beta )
+                {
+                    if( abs( p0 - q0 ) < (( alpha >> 2 ) + 2 ) )
+                    {
+                        if( abs( p2 - p0 ) < beta )
+                        {
+                            const int p3 = pix[-4];
+                            /* p0', p1', p2' */
+                            pix[-1] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
+                            pix[-2] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
+                            pix[-3] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
+                        }
+                        else
+                        {
+                            /* p0' */
+                            pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
+                        }
+                        if( abs( q2 - q0 ) < beta )
+                        {
+                            const int q3 = pix[3];
+                            /* q0', q1', q2' */
+                            pix[0] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
+                            pix[1] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
+                            pix[2] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
+                        }
+                        else
+                        {
+                            /* q0' */
+                            pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
+                        }
+                    }
+                    else
+                    {
+                        /* p0', q0' */
+                        pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
+                        pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
+                    }
+                }
+                pix += i_pix_stride;
+            }
+        }
+    }
+}
+
+static inline void deblocking_filter_edgecv( x264_t *h, uint8_t *pix, int i_pix_stride, int bS[4], int i_QP )
+{
+    int i, d;
+    const int i_index_a = x264_clip3( i_QP + h->sh.i_alpha_c0_offset, 0, 51 );
+    const int alpha = i_alpha_table[i_index_a];
+    const int beta  = i_beta_table[x264_clip3( i_QP + h->sh.i_beta_offset, 0, 51 )];
+    /* Vertical-edge variant that touches only p0 and q0 and advances 2 rows per bS entry. */
+    for( i = 0; i < 4; i++ )
+    {
+        if( bS[i] == 0 )
+        {
+            pix += 2 * i_pix_stride;    /* nothing to filter: skip the 2 rows this bS entry covers */
+            continue;
+        }
+
+        if( bS[i] < 4 )
+        {
+            const int tc = i_tc0_table[i_index_a][bS[i] - 1] + 1;
+            /* 2px edge length (because we use the same bS as the one for luma) */
+            for( d = 0; d < 2; d++ )
+            {
+                const int p0 = pix[-1];
+                const int p1 = pix[-2];
+                const int q0 = pix[0];
+                const int q1 = pix[1];
+
+                if( abs( p0 - q0 ) < alpha &&
+                    abs( p1 - p0 ) < beta &&
+                    abs( q1 - q0 ) < beta )
+                {
+                    const int i_delta = x264_clip3( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
+
+                    pix[-1] = clip_uint8( p0 + i_delta );    /* p0' */
+                    pix[0]  = clip_uint8( q0 - i_delta );    /* q0' */
+                }
+                pix += i_pix_stride;
+            }
+        }
+        else
+        {
+            /* 2px edge length (because we use the same bS as the one for luma) */
+            for( d = 0; d < 2; d++ )
+            {
+                const int p0 = pix[-1];
+                const int p1 = pix[-2];
+                const int q0 = pix[0];
+                const int q1 = pix[1];
+
+                if( abs( p0 - q0 ) < alpha &&
+                    abs( p1 - p0 ) < beta &&
+                    abs( q1 - q0 ) < beta )
+                {
+                    pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
+                    pix[0]  = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
+                }
+                pix += i_pix_stride;
+            }
+        }
+    }
+}
+
+static inline void deblocking_filter_edgeh( x264_t *h, uint8_t *pix, int i_pix_stride, int bS[4], int i_QP )
+{
+    int i, d;
+    const int i_index_a = x264_clip3( i_QP + h->sh.i_alpha_c0_offset, 0, 51 );
+    const int alpha = i_alpha_table[i_index_a];
+    const int beta  = i_beta_table[x264_clip3( i_QP + h->sh.i_beta_offset, 0, 51 )];
+    /* Filters across a horizontal edge: pix[0] is the first sample below it (q0), pix[-i_pix_next] the last one above it (p0). */
+    int i_pix_next  = i_pix_stride;
+
+    for( i = 0; i < 4; i++ )
+    {
+        if( bS[i] == 0 )
+        {
+            pix += 4;    /* nothing to filter: skip the 4 columns this bS entry covers */
+            continue;
+        }
+
+        if( bS[i] < 4 )
+        {
+            const int tc0 = i_tc0_table[i_index_a][bS[i] - 1];
+            /* each bS entry covers 4 columns of the edge */
+            for( d = 0; d < 4; d++ )
+            {
+                const int p0 = pix[-i_pix_next];
+                const int p1 = pix[-2*i_pix_next];
+                const int p2 = pix[-3*i_pix_next];
+                const int q0 = pix[0];
+                const int q1 = pix[1*i_pix_next];
+                const int q2 = pix[2*i_pix_next];
+
+                if( abs( p0 - q0 ) < alpha &&
+                    abs( p1 - p0 ) < beta &&
+                    abs( q1 - q0 ) < beta )
+                {
+                    int tc = tc0;
+                    int i_delta;
+                    /* each side that is smooth enough also gets p1/q1 corrected and widens the clip range by 1 */
+                    if( abs( p2 - p0 ) < beta )
+                    {
+                        pix[-2*i_pix_next] = p1 + x264_clip3( ( p2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( p1 << 1 ) ) >> 1, -tc0, tc0 );
+                        tc++;
+                    }
+                    if( abs( q2 - q0 ) < beta )
+                    {
+                        pix[i_pix_next] = q1 + x264_clip3( ( q2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( q1 << 1 ) ) >> 1, -tc0, tc0 );
+                        tc++;
+                    }
+
+                    i_delta = x264_clip3( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
+                    pix[-i_pix_next] = clip_uint8( p0 + i_delta );    /* p0' */
+                    pix[0]           = clip_uint8( q0 - i_delta );    /* q0' */
+                }
+                pix++;
+            }
+        }
+        else
+        {
+            /* bS >= 4: unclipped smoothing, again 4 columns per bS entry */
+            for( d = 0; d < 4; d++ )
+            {
+                const int p0 = pix[-i_pix_next];
+                const int p1 = pix[-2*i_pix_next];
+                const int p2 = pix[-3*i_pix_next];
+                const int q0 = pix[0];
+                const int q1 = pix[1*i_pix_next];
+                const int q2 = pix[2*i_pix_next];
+
+                if( abs( p0 - q0 ) < alpha &&
+                    abs( p1 - p0 ) < beta &&
+                    abs( q1 - q0 ) < beta )
+                {
+                    const int p3 = pix[-4*i_pix_next];
+                    const int q3 = pix[ 3*i_pix_next];
+                    /* unlike the vertical-edge filter, p3 and q3 are read here even when the branch using them is not taken */
+                    if( abs( p0 - q0 ) < (( alpha >> 2 ) + 2 ) )
+                    {
+                        if( abs( p2 - p0 ) < beta )
+                        {
+                            /* p0', p1', p2' */
+                            pix[-1*i_pix_next] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
+                            pix[-2*i_pix_next] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
+                            pix[-3*i_pix_next] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
+                        }
+                        else
+                        {
+                            /* p0' */
+                            pix[-1*i_pix_next] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
+                        }
+                        if( abs( q2 - q0 ) < beta )
+                        {
+                            /* q0', q1', q2' */
+                            pix[0*i_pix_next] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
+                            pix[1*i_pix_next] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
+                            pix[2*i_pix_next] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
+                        }
+                        else
+                        {
+                            /* q0' */
+                            pix[0*i_pix_next] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
+                        }
+                    }
+                    else
+                    {
+                        /* p0' */
+                        pix[-1*i_pix_next] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
+                        /* q0' */
+                        pix[0*i_pix_next] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
+                    }
+                }
+                pix++;
+            }
+
+        }
+    }
+}
+
+static inline void deblocking_filter_edgech( x264_t *h, uint8_t *pix, int i_pix_stride, int bS[4], int i_QP )
+{
+    int i, d;
+    const int i_index_a = x264_clip3( i_QP + h->sh.i_alpha_c0_offset, 0, 51 );
+    const int alpha = i_alpha_table[i_index_a];
+    const int beta  = i_beta_table[x264_clip3( i_QP + h->sh.i_beta_offset, 0, 51 )];
+    /* Horizontal-edge variant that touches only p0 and q0 and advances 2 columns per bS entry. */
+    int i_pix_next  = i_pix_stride;
+
+    for( i = 0; i < 4; i++ )
+    {
+        if( bS[i] == 0 )
+        {
+            pix += 2;    /* nothing to filter: skip the 2 columns this bS entry covers */
+            continue;
+        }
+        if( bS[i] < 4 )
+        {
+            int tc = i_tc0_table[i_index_a][bS[i] - 1] + 1;
+            /* 2px edge length (see deblocking_filter_edgecv) */
+            for( d = 0; d < 2; d++ )
+            {
+                const int p0 = pix[-1*i_pix_next];
+                const int p1 = pix[-2*i_pix_next];
+                const int q0 = pix[0];
+                const int q1 = pix[1*i_pix_next];
+
+                if( abs( p0 - q0 ) < alpha &&
+                    abs( p1 - p0 ) < beta &&
+                    abs( q1 - q0 ) < beta )
+                {
+                    int i_delta = x264_clip3( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
+
+                    pix[-i_pix_next] = clip_uint8( p0 + i_delta );    /* p0' */
+                    pix[0]           = clip_uint8( q0 - i_delta );    /* q0' */
+                }
+                pix++;
+            }
+        }
+        else
+        {
+            /* 2px edge length (see deblocking_filter_edgecv) */
+            for( d = 0; d < 2; d++ )
+            {
+                const int p0 = pix[-1*i_pix_next];
+                const int p1 = pix[-2*i_pix_next];
+                const int q0 = pix[0];
+                const int q1 = pix[1*i_pix_next];
+
+                if( abs( p0 - q0 ) < alpha &&
+                    abs( p1 - p0 ) < beta &&
+                    abs( q1 - q0 ) < beta )
+                {
+                    pix[-i_pix_next] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
+                    pix[0]           = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
+                }
+                pix++;
+            }
+        }
+    }
+}
+
+void x264_frame_deblocking_filter( x264_t *h, int i_slice_type )
+{
+    const int s8x8 = 2 * h->mb.i_mb_stride;
+    const int s4x4 = 4 * h->mb.i_mb_stride;
+    /* s8x8 / s4x4: row strides used below to index h->mb.ref (one entry per 8x8 block) and h->mb.mv (one per 4x4 block) */
+    int mb_y, mb_x;
+
+    for( mb_y = 0, mb_x = 0; mb_y < h->sps->i_mb_height; )
+    {
+        const int mb_xy  = mb_y * h->mb.i_mb_stride + mb_x;
+        const int mb_8x8 = 2 * s8x8 * mb_y + 2 * mb_x;
+        const int mb_4x4 = 4 * s4x4 * mb_y + 4 * mb_x;
+        int i_edge;
+        int i_dir;
+
+        /* i_dir == 0 -> vertical edge
+         * i_dir == 1 -> horizontal edge */
+        for( i_dir = 0; i_dir < 2; i_dir++ )
+        {
+            int i_start;
+            int i_qp, i_qpn;
+            /* edge 0 lies on the macroblock boundary: it is skipped when there is no macroblock on that side */
+            i_start = (( i_dir == 0 && mb_x != 0 ) || ( i_dir == 1 && mb_y != 0 ) ) ? 0 : 1;
+
+            for( i_edge = i_start; i_edge < 4; i_edge++ )
+            {
+                int mbn_xy  = i_edge > 0 ? mb_xy  : ( i_dir == 0 ? mb_xy  - 1 : mb_xy - h->mb.i_mb_stride );
+                int mbn_8x8 = i_edge > 0 ? mb_8x8 : ( i_dir == 0 ? mb_8x8 - 2 : mb_8x8 - 2 * s8x8 );
+                int mbn_4x4 = i_edge > 0 ? mb_4x4 : ( i_dir == 0 ? mb_4x4 - 4 : mb_4x4 - 4 * s4x4 );
+                /* mbn_*: macroblock on the other side of the edge (left/top neighbour for edge 0, this macroblock otherwise) */
+                int bS[4];  /* filtering strength */
+
+                /* *** Get bS for each 4px for the current edge *** */
+                if( IS_INTRA( h->mb.type[mb_xy] ) || IS_INTRA( h->mb.type[mbn_xy] ) )
+                {
+                    bS[0] = bS[1] = bS[2] = bS[3] = ( i_edge == 0 ? 4 : 3 );
+                }
+                else
+                {
+                    int i;
+                    for( i = 0; i < 4; i++ )
+                    {
+                        int x  = i_dir == 0 ? i_edge : i;
+                        int y  = i_dir == 0 ? i      : i_edge;
+                        int xn = (x - (i_dir == 0 ? 1 : 0 ))&0x03;
+                        int yn = (y - (i_dir == 0 ? 0 : 1 ))&0x03;
+                        /* (x,y): 4x4 block on this side of the edge; (xn,yn): block across it, wrapping to 3 inside the neighbour */
+                        if( h->mb.non_zero_count[mb_xy][block_idx_xy[x][y]] != 0 ||
+                            h->mb.non_zero_count[mbn_xy][block_idx_xy[xn][yn]] != 0 )
+                        {
+                            bS[i] = 2;
+                        }
+                        else if( i_slice_type == SLICE_TYPE_P )
+                        {
+                            if( h->mb.ref[0][mb_8x8+(x/2)+(y/2)*s8x8] != h->mb.ref[0][mbn_8x8+(xn/2)+(yn/2)*s8x8] ||
+                                abs( h->mb.mv[0][mb_4x4+x+y*s4x4][0] - h->mb.mv[0][mbn_4x4+xn+yn*s4x4][0] ) >= 4 ||
+                                abs( h->mb.mv[0][mb_4x4+x+y*s4x4][1] - h->mb.mv[0][mbn_4x4+xn+yn*s4x4][1] ) >= 4 )
+                            {
+                                bS[i] = 1;
+                            }
+                            else
+                            {
+                                bS[i] = 0;
+                            }
+                        }
+                        else
+                        {
+                            /* FIXME: B slices are unsupported; the whole filter is abandoned at this point */
+                            fprintf( stderr, "deblocking filter doesn't work yet with B slice\n" );
+                            return;
+                        }
+                    }
+                }
+
+                /* *** filter *** */
+                /* Y plane */
+                i_qp = h->mb.qp[mb_xy];
+                i_qpn= h->mb.qp[mbn_xy];
+                /* the QP passed to the edge filters is the rounded average of the two macroblocks' QPs */
+                if( i_dir == 0 )
+                {
+                    /* vertical edge */
+                    deblocking_filter_edgev( h, &h->fdec->plane[0][16 * mb_y * h->fdec->i_stride[0]+ 16 * mb_x + 4 * i_edge],
+                                                h->fdec->i_stride[0], bS, (i_qp+i_qpn+1) >> 1);
+                    if( (i_edge % 2) == 0  )
+                    {
+                        /* U/V planes: only every second edge, at an offset of 2*i_edge samples */
+                        int i_qpc = ( i_chroma_qp_table[x264_clip3( i_qp + h->pps->i_chroma_qp_index_offset, 0, 51 )] +
+                                      i_chroma_qp_table[x264_clip3( i_qpn + h->pps->i_chroma_qp_index_offset, 0, 51 )] + 1 ) >> 1;
+                        deblocking_filter_edgecv( h, &h->fdec->plane[1][8*(mb_y*h->fdec->i_stride[1]+mb_x)+i_edge*2],
+                                                      h->fdec->i_stride[1], bS, i_qpc );
+                        deblocking_filter_edgecv( h, &h->fdec->plane[2][8*(mb_y*h->fdec->i_stride[2]+mb_x)+i_edge*2],
+                                                  h->fdec->i_stride[2], bS, i_qpc );
+                    }
+                }
+                else
+                {
+                    /* horizontal edge */
+                    deblocking_filter_edgeh( h, &h->fdec->plane[0][(16*mb_y + 4 * i_edge) * h->fdec->i_stride[0]+ 16 * mb_x],
+                                                h->fdec->i_stride[0], bS, (i_qp+i_qpn+1) >> 1 );
+                    /* U/V planes */
+                    if( ( i_edge % 2  ) == 0 )
+                    {
+                        int i_qpc = ( i_chroma_qp_table[x264_clip3( i_qp + h->pps->i_chroma_qp_index_offset, 0, 51 )] +
+                                      i_chroma_qp_table[x264_clip3( i_qpn + h->pps->i_chroma_qp_index_offset, 0, 51 )] + 1 ) >> 1;
+                        deblocking_filter_edgech( h, &h->fdec->plane[1][8*(mb_y*h->fdec->i_stride[1]+mb_x)+i_edge*2*h->fdec->i_stride[1]],
+                                                 h->fdec->i_stride[1], bS, i_qpc );
+                        deblocking_filter_edgech( h, &h->fdec->plane[2][8*(mb_y*h->fdec->i_stride[2]+mb_x)+i_edge*2*h->fdec->i_stride[2]],
+                                                 h->fdec->i_stride[2], bS, i_qpc );
+                    }
+                }
+            }
+        }
+
+        /* next mb, in raster order */
+        mb_x++;
+        if( mb_x >= h->sps->i_mb_width )
+        {
+            mb_x = 0;
+            mb_y++;
+        }
+    }
+}
+
+
+
+
diff --git a/core/frame.h b/core/frame.h
new file mode 100644
index 00000000..da7c4576
--- /dev/null
+++ b/core/frame.h
@@ -0,0 +1,56 @@
+/*****************************************************************************
+ * frame.h: h264 encoder library
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: frame.h,v 1.1 2004/06/03 19:27:06 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#ifndef _FRAME_H
+#define _FRAME_H 1
+
+typedef struct
+{
+    /* Per-frame metadata. NOTE(review): meanings below are inferred from the field names -- confirm */
+    int     i_poc;      /* presumably the picture order count */
+    int     i_type;     /* presumably the frame type */
+    int     i_qpplus1;  /* presumably QP + 1 -- TODO confirm how 0 is treated */
+    int64_t i_pts;      /* presumably the presentation timestamp */
+
+    /* YUV buffer: the three arrays below are indexed by plane number */
+    int     i_plane;        /* presumably the number of planes in use */
+    int     i_stride[4];    /* per-plane stride; the deblocking filter multiplies row numbers by it */
+    int     i_lines[4];     /* presumably the number of lines in each plane */
+    uint8_t *plane[4];      /* pixel pointers; the deblocking filter uses [0] as luma, [1]/[2] as U/V */
+
+    /* Unrestricted motion vectors need room around the picture, so more memory
+     * is allocated than the picture itself; the allocations are kept in buffer[]. */
+    void    *buffer[4];
+
+} x264_frame_t;
+
+x264_frame_t *x264_frame_new( x264_t *h );
+void          x264_frame_delete( x264_frame_t *frame );
+
+void          x264_frame_copy_picture( x264_t *h, x264_frame_t *dst, x264_picture_t *src );
+
+void          x264_frame_expand_border( x264_frame_t *frame );
+
+void          x264_frame_deblocking_filter( x264_t *h, int i_slice_type );
+
+#endif
diff --git a/core/i386/cpu.asm b/core/i386/cpu.asm
new file mode 100644
index 00000000..06ac1e05
--- /dev/null
+++ b/core/i386/cpu.asm
@@ -0,0 +1,111 @@
+;*****************************************************************************
+;* cpu.asm: h264 encoder library
+;*****************************************************************************
+;* Copyright (C) 2003 x264 project
+;* $Id: cpu.asm,v 1.1 2004/06/03 19:27:07 fenrir Exp $
+;*
+;* Authors: Laurent Aimar <fenrir@via.ecp.fr>
+;*
+;* This program is free software; you can redistribute it and/or modify
+;* it under the terms of the GNU General Public License as published by
+;* the Free Software Foundation; either version 2 of the License, or
+;* (at your option) any later version.
+;*
+;* This program is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;* GNU General Public License for more details.
+;*
+;* You should have received a copy of the GNU General Public License
+;* along with this program; if not, write to the Free Software
+;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+;*****************************************************************************
+
+BITS 32
+
+;=============================================================================
+; Macros and other preprocessor constants
+;=============================================================================
+
+%macro cglobal 1                ; cglobal name: export name from this object file
+	%ifdef PREFIX
+		global _%1              ; PREFIX builds export the symbol with a leading underscore ...
+		%define %1 _%1
+	%else
+		global %1               ; ... all other builds export it unchanged
+	%endif
+%endmacro
+
+;=============================================================================
+; Code
+;=============================================================================
+
+SECTION .text
+
+cglobal x264_cpu_cpuid_test
+cglobal x264_cpu_cpuid
+cglobal x264_emms
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+;   int __cdecl x264_cpu_cpuid_test( void ) return 0 if unsupported
+;-----------------------------------------------------------------------------
+x264_cpu_cpuid_test:
+    pushfd                      ; save the caller's EFLAGS (restored by the final popfd)
+    push    ebx                 ; ebx is used as scratch below
+
+    pushfd
+    pop     eax                 ; eax = current EFLAGS
+    mov     ebx, eax            ; keep an unmodified copy for the comparison
+    xor     eax, 0x200000       ; flip bit 21 (the EFLAGS ID bit)
+    push    eax
+    popfd                       ; try to write the flipped value back
+    pushfd
+    pop     eax                 ; read EFLAGS again
+    xor     eax, ebx            ; eax = bits that actually changed; 0 when bit 21 could not be toggled
+    
+    pop     ebx
+    popfd
+    ret                         ; result is in eax: 0 means cpuid is unsupported, non-zero means supported
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+;   int __cdecl x264_cpu_cpuid( int op, int *eax, int *ebx, int *ecx, int *edx )
+;-----------------------------------------------------------------------------
+x264_cpu_cpuid:
+
+    push    ebp
+    mov     ebp,    esp         ; frame pointer: first argument is at [ebp+8]
+    push    ebx                 ; cpuid overwrites ebx, so it is saved here
+    push    esi                 ; esi is used as the store pointer below
+    push    edi                 ; NOTE(review): edi is saved/restored but not otherwise used in this routine
+    
+    mov     eax,    [ebp +  8]  ; eax = op
+    cpuid                       ; results come back in eax, ebx, ecx, edx
+
+    mov     esi,    [ebp + 12]  ; esi = 2nd argument (int *eax)
+    mov     [esi],  eax
+
+    mov     esi,    [ebp + 16]  ; esi = 3rd argument (int *ebx)
+    mov     [esi],  ebx
+
+    mov     esi,    [ebp + 20]  ; esi = 4th argument (int *ecx)
+    mov     [esi],  ecx
+
+    mov     esi,    [ebp + 24]  ; esi = 5th argument (int *edx)
+    mov     [esi],  edx
+
+    pop     edi
+    pop     esi
+    pop     ebx
+    pop     ebp
+    ret                         ; eax is not reloaded: it still holds the value cpuid left in it
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+;   void __cdecl x264_emms( void )
+;-----------------------------------------------------------------------------
+x264_emms:
+    emms                        ; empty the MMX state so x87 floating point can be used again
+    ret
+
diff --git a/core/i386/dct-c.c b/core/i386/dct-c.c
new file mode 100644
index 00000000..d824a23f
--- /dev/null
+++ b/core/i386/dct-c.c
@@ -0,0 +1,294 @@
+/*****************************************************************************
+ * dct-c.c: h264 encoder library
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: dct-c.c,v 1.1 2004/06/03 19:27:07 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#include <stdlib.h>
+#include <stdint.h>
+
+#include "x264.h"
+
+#include "../dct.h"
+#include "dct.h"
+
+
+#if 0
+#define MMX_ZERO( MMZ ) \
+    asm volatile( "pxor " #MMZ ", " #MMZ "\n" :: )
+
+/* MMP : diff,  MMT: temp */
+#define MMX_LOAD_DIFF_4P( MMP, MMT, MMZ, pix1, pix2 ) \
+    asm volatile( "movd (%0), " #MMP "\n" \
+                  "punpcklbw  " #MMZ ", " #MMP "\n" \
+                  "movd (%1), " #MMT "\n" \
+                  "punpcklbw  " #MMZ ", " #MMT "\n" \
+                  "psubw      " #MMT ", " #MMP "\n" : : "r"(pix1), "r"(pix2) )
+
+/* in: out: mma=mma+mmb, mmb=mmb-mma */
+#define MMX_SUMSUB_BA( MMA, MMB ) \
+    asm volatile( "paddw " #MMB ", " #MMA "\n"\
+                  "paddw " #MMB ", " #MMB "\n"\
+                  "psubw " #MMA ", " #MMB "\n" :: )
+
+#define MMX_SUMSUB_BADC( MMA, MMB, MMC, MMD ) \
+    asm volatile( "paddw " #MMB ", " #MMA "\n"\
+                  "paddw " #MMD ", " #MMC "\n"\
+                  "paddw " #MMB ", " #MMB "\n"\
+                  "paddw " #MMD ", " #MMD "\n"\
+                  "psubw " #MMA ", " #MMB "\n"\
+                  "psubw " #MMC ", " #MMD "\n" :: )
+
+/* inputs MMA, MMB output MMA MMT */
+#define MMX_SUMSUB2_AB( MMA, MMB, MMT ) \
+    asm volatile( "movq  " #MMA ", " #MMT "\n" \
+                  "paddw " #MMA ", " #MMA "\n" \
+                  "paddw " #MMB ", " #MMA "\n" \
+                  "psubw " #MMB ", " #MMT "\n" \
+                  "psubw " #MMB ", " #MMT "\n" :: )
+
+/* inputs MMA, MMB output MMA MMS */
+#define MMX_SUMSUBD2_AB( MMA, MMB, MMT, MMS ) \
+    asm volatile( "movq  " #MMA ", " #MMS "\n" \
+                  "movq  " #MMB ", " #MMT "\n" \
+                  "psraw   $1    , " #MMB "\n"       \
+                  "psraw   $1    , " #MMS "\n"       \
+                  "paddw " #MMB ", " #MMA "\n" \
+                  "psubw " #MMT ", " #MMS "\n" :: )
+
+#define SBUTTERFLYwd(a,b,t )\
+    asm volatile( "movq " #a ", " #t "        \n\t" \
+                  "punpcklwd " #b ", " #a "   \n\t" \
+                  "punpckhwd " #b ", " #t "   \n\t" :: )
+
+#define SBUTTERFLYdq(a,b,t )\
+    asm volatile( "movq " #a ", " #t "        \n\t" \
+                  "punpckldq " #b ", " #a "   \n\t" \
+                  "punpckhdq " #b ", " #t "   \n\t" :: )
+
+/* input ABCD output ADTC */
+#define MMX_TRANSPOSE( MMA, MMB, MMC, MMD, MMT ) \
+        SBUTTERFLYwd( MMA, MMB, MMT ); \
+        SBUTTERFLYwd( MMC, MMD, MMB ); \
+        SBUTTERFLYdq( MMA, MMC, MMD ); \
+        SBUTTERFLYdq( MMT, MMB, MMC )
+
+#define MMX_STORE_DIFF_4P( MMP, MMT, MM32, MMZ, dst ) \
+    asm volatile( "paddw     " #MM32 "," #MMP "\n" \
+                  "psraw       $6,     " #MMP "\n" \
+                  "movd        (%0),   " #MMT "\n" \
+                  "punpcklbw " #MMZ ", " #MMT "\n" \
+                  "paddsw    " #MMT ", " #MMP "\n" \
+                  "packuswb  " #MMZ ", " #MMP "\n" \
+                  "movd      " #MMP ",   (%0)\n" :: "r"(dst) )
+
+#define UNUSED_LONGLONG( foo ) \
+    static const unsigned long long foo __asm__ (#foo)  __attribute__((unused)) __attribute__((aligned(16)))
+
+UNUSED_LONGLONG( x264_mmx_32 ) = 0x0020002000200020ULL;
+UNUSED_LONGLONG( x264_mmx_1 ) = 0x0001000100010001ULL;
+
+
+/*
+ * XXX For all dct dc : input could be equal to output so ...
+ */
+void x264_dct4x4dc_mmxext( int16_t d[4][4] )
+{
+    /* load DCT */
+    asm volatile(
+        "movq   (%0), %%mm0\n"
+        "movq  8(%0), %%mm1\n"
+        "movq 16(%0), %%mm2\n"
+        "movq 24(%0), %%mm3\n" :: "r"(d) );
+
+    MMX_SUMSUB_BADC( %%mm1, %%mm0, %%mm3, %%mm2 );  /* mm1=s01  mm0=d01  mm3=s23  mm2=d23 */
+    MMX_SUMSUB_BADC( %%mm3, %%mm1, %%mm2, %%mm0 );  /* mm3=s01+s23  mm1=s01-s23  mm2=d01+d23  mm0=d01-d23 */
+
+    /* in: mm3, mm1, mm0, mm2  out: mm3, mm2, mm4, mm0 */
+    MMX_TRANSPOSE  ( %%mm3, %%mm1, %%mm0, %%mm2, %%mm4 );
+
+    MMX_SUMSUB_BADC( %%mm2, %%mm3, %%mm0, %%mm4 );  /* mm2=s01  mm3=d01  mm0=s23  mm4=d23 */
+    MMX_SUMSUB_BADC( %%mm0, %%mm2, %%mm4, %%mm3 );  /* mm0=s01+s23  mm2=s01-s23  mm4=d01+d23  mm3=d01-d23 */
+
+    /* in: mm0, mm2, mm3, mm4  out: mm0, mm4, mm1, mm3 */
+    MMX_TRANSPOSE  ( %%mm0, %%mm2, %%mm3, %%mm4, %%mm1 );
+
+
+    asm volatile( "movq x264_mmx_1, %%mm6" :: );
+
+    /* Store back */
+    asm volatile(
+        "paddw %%mm6, %%mm0\n"
+        "paddw %%mm6, %%mm4\n"
+
+        "psraw $1,    %%mm0\n"
+        "movq  %%mm0,   (%0)\n"
+        "psraw $1,    %%mm4\n"
+        "movq  %%mm4,  8(%0)\n"
+
+        "paddw %%mm6, %%mm1\n"
+        "paddw %%mm6, %%mm3\n"
+
+        "psraw $1,    %%mm1\n"
+        "movq  %%mm1, 16(%0)\n"
+        "psraw $1,    %%mm3\n"
+        "movq  %%mm3, 24(%0)\n" :: "r"(d) );
+}
+
+void x264_idct4x4dc_mmxext( int16_t d[4][4] )
+{
+    /* load DCT */
+    asm volatile(
+        "movq   (%0), %%mm0\n"
+        "movq  8(%0), %%mm1\n"
+        "movq 16(%0), %%mm2\n" 
+        "movq 24(%0), %%mm3\n" :: "r"(d) );
+
+    MMX_SUMSUB_BADC( %%mm1, %%mm0, %%mm3, %%mm2 );  /* mm1=s01  mm0=d01  mm3=s23  mm2=d23 */
+    MMX_SUMSUB_BADC( %%mm3, %%mm1, %%mm2, %%mm0 );  /* mm3=s01+s23 mm1=s01-s23 mm2=d01+d23 mm0=d01-d23 */
+
+    /* in: mm3, mm1, mm0, mm2  out: mm3, mm2, mm4, mm0 */
+    MMX_TRANSPOSE( %%mm3, %%mm1, %%mm0, %%mm2, %%mm4 );
+
+    MMX_SUMSUB_BADC( %%mm2, %%mm3, %%mm0, %%mm4 );  /* mm2=s01  mm3=d01  mm0=s23  mm4=d23 */
+    MMX_SUMSUB_BADC( %%mm0, %%mm2, %%mm4, %%mm3 );  /* mm0=s01+s23 mm2=s01-s23 mm4=d01+d23 mm3=d01-d23 */
+
+    /* in: mm0, mm2, mm3, mm4  out: mm0, mm4, mm1, mm3 */
+    MMX_TRANSPOSE( %%mm0, %%mm2, %%mm3, %%mm4, %%mm1 );
+
+    /* Store back */
+    asm volatile(
+        "movq %%mm0,   (%0)\n"
+        "movq %%mm4,  8(%0)\n"
+        "movq %%mm1, 16(%0)\n" 
+        "movq %%mm3, 24(%0)\n" :: "r"(d) );
+}
+
+/****************************************************************************
+ * subXxX_dct:
+ ****************************************************************************/
+inline void x264_sub4x4_dct_mmxext( int16_t dct[4][4], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
+{
+    /* Reset mm7 */
+    MMX_ZERO( %%mm7 );
+
+    /* Load 4 lines */
+    MMX_LOAD_DIFF_4P( %%mm0, %%mm6, %%mm7, &pix1[0*i_pix1], &pix2[0*i_pix2] );
+    MMX_LOAD_DIFF_4P( %%mm1, %%mm6, %%mm7, &pix1[1*i_pix1], &pix2[1*i_pix2] );
+    MMX_LOAD_DIFF_4P( %%mm2, %%mm6, %%mm7, &pix1[2*i_pix1], &pix2[2*i_pix2] );
+    MMX_LOAD_DIFF_4P( %%mm3, %%mm6, %%mm7, &pix1[3*i_pix1], &pix2[3*i_pix2] );
+
+    MMX_SUMSUB_BADC( %%mm3, %%mm0, %%mm2, %%mm1 );  /* mm3=s03  mm0=d03  mm2=s12  mm1=d12 */
+
+    MMX_SUMSUB_BA(  %%mm2, %%mm3 );                 /* mm2=s03+s12      mm3=s03-s12 */
+    MMX_SUMSUB2_AB( %%mm0, %%mm1, %%mm4 );          /* mm0=2.d03+d12    mm4=d03-2.d12 */
+
+    /* transpose in: mm2, mm0, mm3, mm4, out: mm2, mm4, mm1, mm3 */
+    MMX_TRANSPOSE( %%mm2, %%mm0, %%mm3, %%mm4, %%mm1 );
+
+    MMX_SUMSUB_BADC( %%mm3, %%mm2, %%mm1, %%mm4 );  /* mm3=s03  mm2=d03  mm1=s12  mm4=d12 */
+
+    MMX_SUMSUB_BA(  %%mm1, %%mm3 );                 /* mm1=s03+s12      mm3=s03-s12 */
+    MMX_SUMSUB2_AB( %%mm2, %%mm4, %%mm0 );          /* mm2=2.d03+d12    mm0=d03-2.d12 */
+
+    /* transpose in: mm1, mm2, mm3, mm0, out: mm1, mm0, mm4, mm3 */
+    MMX_TRANSPOSE( %%mm1, %%mm2, %%mm3, %%mm0, %%mm4 );
+
+    /* Store back */
+    asm volatile(
+        "movq %%mm1, (%0)\n"
+        "movq %%mm0, 8(%0)\n"
+        "movq %%mm4, 16(%0)\n"
+        "movq %%mm3, 24(%0)\n" :: "r"(dct) );
+}
+#endif
+
+void x264_sub8x8_dct_mmxext( int16_t dct[4][4][4], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
+{
+    /* Transform an 8x8 block as four 4x4 blocks in raster order: bit 0 of i selects the column, bit 1 the row */
+    int i;
+    for( i = 0; i < 4; i++ )
+        x264_sub4x4_dct_mmxext( dct[i], &pix1[4*(i>>1)*i_pix1 + 4*(i&1)], i_pix1, &pix2[4*(i>>1)*i_pix2 + 4*(i&1)], i_pix2 );
+}
+
+void x264_sub16x16_dct_mmxext( int16_t dct[16][4][4], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
+{
+    /* Transform a 16x16 block as four 8x8 quadrants in raster order; quadrant i fills dct[4*i] .. dct[4*i+3] */
+    int i;
+    for( i = 0; i < 4; i++ )
+        x264_sub8x8_dct_mmxext( &dct[4*i], &pix1[8*(i>>1)*i_pix1 + 8*(i&1)], i_pix1, &pix2[8*(i>>1)*i_pix2 + 8*(i&1)], i_pix2 );
+}
+
+
+
+/****************************************************************************
+ * addXxX_idct:
+ ****************************************************************************/
+#if 0
+inline void x264_add4x4_idct_mmxext( uint8_t *p_dst, int i_dst, int16_t dct[4][4] )
+{
+    /* Load dct coeffs */
+    asm volatile(
+        "movq   (%0), %%mm0\n"
+        "movq  8(%0), %%mm1\n"
+        "movq 16(%0), %%mm2\n"
+        "movq 24(%0), %%mm3\n" :: "r"(dct) );
+
+    MMX_SUMSUB_BA  ( %%mm2, %%mm0 );                /* mm2=s02  mm0=d02 */
+    MMX_SUMSUBD2_AB( %%mm1, %%mm3, %%mm5, %%mm4 );  /* mm1=s13  mm4=d13 ( well 1 + 3>>1 and 1>>1 + 3) */
+
+    MMX_SUMSUB_BADC( %%mm1, %%mm2, %%mm4, %%mm0 );  /* mm1=s02+s13  mm2=s02-s13  mm4=d02+d13  mm0=d02-d13 */
+
+    /* in: mm1, mm4, mm0, mm2  out: mm1, mm2, mm3, mm0 */
+    MMX_TRANSPOSE  ( %%mm1, %%mm4, %%mm0, %%mm2, %%mm3 );
+
+    MMX_SUMSUB_BA  ( %%mm3, %%mm1 );                /* mm3=s02  mm1=d02 */
+    MMX_SUMSUBD2_AB( %%mm2, %%mm0, %%mm5, %%mm4 );  /* mm2=s13  mm4=d13 ( well 1 + 3>>1 and 1>>1 + 3) */
+
+    MMX_SUMSUB_BADC( %%mm2, %%mm3, %%mm4, %%mm1 );  /* mm2=s02+s13  mm3=s02-s13  mm4=d02+d13  mm1=d02-d13 */
+
+    /* in: mm2, mm4, mm1, mm3  out: mm2, mm3, mm0, mm1 */
+    MMX_TRANSPOSE  ( %%mm2, %%mm4, %%mm1, %%mm3, %%mm0 );
+
+    MMX_ZERO( %%mm7 );
+    asm volatile( "movq x264_mmx_32, %%mm6\n" :: );
+
+    MMX_STORE_DIFF_4P( %%mm2, %%mm4, %%mm6, %%mm7, &p_dst[0*i_dst] );
+    MMX_STORE_DIFF_4P( %%mm3, %%mm4, %%mm6, %%mm7, &p_dst[1*i_dst] );
+    MMX_STORE_DIFF_4P( %%mm0, %%mm4, %%mm6, %%mm7, &p_dst[2*i_dst] );
+    MMX_STORE_DIFF_4P( %%mm1, %%mm4, %%mm6, %%mm7, &p_dst[3*i_dst] );
+}
+#endif
+
+void x264_add8x8_idct_mmxext( uint8_t *p_dst, int i_dst, int16_t dct[4][4][4] )
+{
+    /* Run the 4x4 add-idct on each quadrant of the 8x8 block at p_dst, raster order (bit 0 = column, bit 1 = row) */
+    int i;
+    for( i = 0; i < 4; i++ )
+        x264_add4x4_idct_mmxext( &p_dst[4*(i>>1)*i_dst + 4*(i&1)], i_dst, dct[i] );
+}
+
+void x264_add16x16_idct_mmxext( uint8_t *p_dst, int i_dst, int16_t dct[16][4][4] )
+{
+    /* Run the 8x8 add-idct on each quadrant of the 16x16 block at p_dst; quadrant i reads dct[4*i] .. dct[4*i+3] */
+    int i;
+    for( i = 0; i < 4; i++ )
+        x264_add8x8_idct_mmxext( &p_dst[8*(i>>1)*i_dst + 8*(i&1)], i_dst, &dct[4*i] );
+}
diff --git a/core/i386/dct.asm b/core/i386/dct.asm
new file mode 100644
index 00000000..054daba7
--- /dev/null
+++ b/core/i386/dct.asm
@@ -0,0 +1,313 @@
+;*****************************************************************************
+;* dct.asm: h264 encoder library
+;*****************************************************************************
+;* Copyright (C) 2003 x264 project
+;* $Id: dct.asm,v 1.1 2004/06/03 19:27:07 fenrir Exp $
+;*
+;* Authors: Min Chen <chenm001.163.com> (converted to nasm)
+;*          Laurent Aimar <fenrir@via.ecp.fr> (initial version)
+;*
+;* This program is free software; you can redistribute it and/or modify
+;* it under the terms of the GNU General Public License as published by
+;* the Free Software Foundation; either version 2 of the License, or
+;* (at your option) any later version.
+;*
+;* This program is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;* GNU General Public License for more details.
+;*
+;* You should have received a copy of the GNU General Public License
+;* along with this program; if not, write to the Free Software
+;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+;*****************************************************************************
+
+;*****************************************************************************
+;*                                                                           *
+;*  Revision history:                                                        *
+;*                                                                           *
+;*  2004.04.28  ported all 4x4 functions to nasm (CM)                        *
+;*                                                                           *
+;*****************************************************************************
+
+BITS 32
+
+;=============================================================================
+; Macros and other preprocessor constants
+;=============================================================================
+
+%macro cglobal 1                ; cglobal name: export name from this object file
+	%ifdef PREFIX
+		global _%1              ; PREFIX builds export the symbol with a leading underscore ...
+		%define %1 _%1
+	%else
+		global %1               ; ... all other builds export it unchanged
+	%endif
+%endmacro
+
+%macro MMX_ZERO 1               ; MMX_ZERO reg: set reg to zero
+    pxor    %1, %1
+%endmacro
+
+%macro MMX_LOAD_DIFF_4P 5       ; args: dst, tmp, zero, mem1, mem2 -> dst = four words mem1[i] - mem2[i]
+    movd        %1, %4          ; dst = 4 bytes from mem1
+    punpcklbw   %1, %3          ; interleave with the zero register: bytes become 16-bit words
+    movd        %2, %5          ; tmp = 4 bytes from mem2
+    punpcklbw   %2, %3
+    psubw       %1, %2          ; dst -= tmp
+%endmacro
+
+%macro MMX_SUMSUB_BA 2          ; in: (a, b)   out: (a+b, b-a), word-wise
+    paddw   %1, %2              ; first  = a+b
+    paddw   %2, %2              ; second = 2b
+    psubw   %2, %1              ; second = 2b - (a+b) = b-a
+%endmacro
+
+%macro MMX_SUMSUB_BADC 4        ; in: (a, b, c, d)   out: (a+b, b-a, c+d, d-c): two interleaved MMX_SUMSUB_BA
+    paddw   %1, %2              ; first  = a+b
+    paddw   %3, %4              ; third  = c+d
+    paddw   %2, %2              ; second = 2b
+    paddw   %4, %4              ; fourth = 2d
+    psubw   %2, %1              ; second = b-a
+    psubw   %4, %3              ; fourth = d-c
+%endmacro
+
+%macro MMX_SUMSUB2_AB 3         ; in: (a, b, tmp)   out: first = 2a+b, tmp = a-2b; b is left unchanged
+    movq    %3, %1              ; tmp = a
+    paddw   %1, %1              ; first = 2a
+    paddw   %1, %2              ; first = 2a+b
+    psubw   %3, %2              ; tmp = a-b
+    psubw   %3, %2              ; tmp = a-2b
+%endmacro
+
+%macro MMX_SUMSUBD2_AB 4        ; in: (a, b, tmp, out2)   out: first = a+(b>>1), out2 = (a>>1)-b; b ends as b>>1, tmp as b
+    movq    %4, %1              ; out2 = a
+    movq    %3, %2              ; tmp = b
+    psraw   %2, $1              ; b >>= 1 (arithmetic); NOTE(review): $1 looks like a leftover AT&T-style immediate
+    psraw   %4, $1              ; out2 = a>>1
+    paddw   %1, %2              ; first = a + (b>>1)
+    psubw   %4, %3              ; out2 = (a>>1) - b
+%endmacro
+
+%macro SBUTTERFLYwd 3           ; in: (a, b, tmp)   out: a = low words of a,b interleaved, tmp = high words interleaved
+    movq        %3, %1          ; tmp = a
+    punpcklwd   %1, %2          ; a   = a0 b0 a1 b1
+    punpckhwd   %3, %2          ; tmp = a2 b2 a3 b3
+%endmacro
+
+%macro SBUTTERFLYdq 3           ; same as SBUTTERFLYwd but interleaving 32-bit halves instead of 16-bit words
+    movq        %3, %1          ; tmp = a
+    punpckldq   %1, %2          ; a   = low dword of a, low dword of b
+    punpckhdq   %3, %2          ; tmp = high dword of a, high dword of b
+%endmacro
+
+;-----------------------------------------------------------------------------
+; input ABCD output ADTC
+;-----------------------------------------------------------------------------
+%macro MMX_TRANSPOSE 5          ; args: A, B, C, D (rows of a 4x4 word matrix), T (temp); transposed rows end in A, D, T, C
+    SBUTTERFLYwd %1, %2, %5     ; A = a0 b0 a1 b1     T = a2 b2 a3 b3
+    SBUTTERFLYwd %3, %4, %2     ; C = c0 d0 c1 d1     B = c2 d2 c3 d3  (B is overwritten)
+    SBUTTERFLYdq %1, %3, %4     ; A = a0 b0 c0 d0     D = a1 b1 c1 d1
+    SBUTTERFLYdq %5, %2, %3     ; T = a2 b2 c2 d2     C = a3 b3 c3 d3
+%endmacro
+
+%macro MMX_STORE_DIFF_4P 5      ; args: val, tmp, round, zero, mem -> mem[i] = clamp to 0..255 of ( mem[i] + ((val[i] + round[i]) >> 6) )
+    paddw       %1, %3          ; add the rounding constant (the caller in this file passes words of 32)
+    psraw       %1, $6          ; arithmetic shift right by 6
+    movd        %2, %5          ; tmp = the 4 bytes currently at mem
+    punpcklbw   %2, %4          ; widen them to words using the zero register
+    paddsw      %1, %2          ; signed-saturating add of the two word vectors
+    packuswb    %1, %1          ; pack to bytes with unsigned saturation
+    movd        %5, %1          ; store 4 bytes back to mem
+%endmacro
+
+;%macro 
+;%endmacro
+
+;=============================================================================
+; Local Data (Read Only)
+;=============================================================================
+
+%ifdef FORMAT_COFF
+SECTION .rodata data
+%else
+SECTION .rodata data align=16
+%endif
+
+;-----------------------------------------------------------------------------
+; Various memory constants (trigonometric values or rounding values)
+;-----------------------------------------------------------------------------
+
+ALIGN 16
+x264_mmx_1:
+  dw 1, 1, 1, 1
+
+x264_mmx_32:
+  dw 32, 32, 32, 32
+
+;=============================================================================
+; Code
+;=============================================================================
+
+SECTION .text
+
+cglobal x264_dct4x4dc_mmxext
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+;   void __cdecl x264_dct4x4dc_mmxext( int16_t d[4][4] )
+;-----------------------------------------------------------------------------
+x264_dct4x4dc_mmxext:
+    mov     eax,        [esp+ 4]                    ; eax = d (nothing pushed, so the argument is at esp+4)
+    movq    mm0,        [eax+ 0]                    ; mm0..mm3 = rows 0..3 of d, four int16 each
+    movq    mm1,        [eax+ 8]
+    movq    mm2,        [eax+16]
+    movq    mm3,        [eax+24]
+
+    ; first pass: add/sub butterflies only (NOTE(review): looks like a 4x4 Hadamard transform -- confirm)
+    MMX_SUMSUB_BADC     mm1, mm0, mm3, mm2          ; mm1=s01  mm0=d01  mm3=s23  mm2=d23
+    MMX_SUMSUB_BADC     mm3, mm1, mm2, mm0          ; mm3=s01+s23  mm1=s01-s23  mm2=d01+d23  mm0=d01-d23
+
+    MMX_TRANSPOSE       mm3, mm1, mm0, mm2, mm4     ; in: mm3, mm1, mm0, mm2  out: mm3, mm2, mm4, mm0
+    ; second pass: the same butterflies on the transposed data
+    MMX_SUMSUB_BADC     mm2, mm3, mm0, mm4          ; mm2=s01  mm3=d01  mm0=s23  mm4=d23
+    MMX_SUMSUB_BADC     mm0, mm2, mm4, mm3          ; mm0=s01+s23  mm2=s01-s23  mm4=d01+d23  mm3=d01-d23
+
+    MMX_TRANSPOSE       mm0, mm2, mm3, mm4, mm1     ; in: mm0, mm2, mm3, mm4  out: mm0, mm4, mm1, mm3
+
+    movq    mm6,        [x264_mmx_1]                ; mm6 = four words of 1
+    paddw   mm0,        mm6                         ; every result word becomes (x + 1) >> 1 ...
+    paddw   mm4,        mm6
+    psraw   mm0,        1
+    movq    [eax+ 0],   mm0                         ; ... and is written back over d, row by row
+    psraw   mm4,        1
+    movq    [eax+ 8],   mm4
+    paddw   mm1,        mm6
+    paddw   mm3,        mm6
+    psraw   mm1,        1
+    movq    [eax+16],   mm1
+    psraw   mm3,        1
+    movq    [eax+24],   mm3
+    ret
+
+cglobal x264_idct4x4dc_mmxext
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+;   void __cdecl x264_idct4x4dc_mmxext( int16_t d[4][4] )
+;-----------------------------------------------------------------------------
+x264_idct4x4dc_mmxext:
+    mov     eax, [esp+ 4]                           ; eax = d (nothing pushed, so the argument is at esp+4)
+    movq    mm0, [eax+ 0]                           ; mm0..mm3 = rows 0..3 of d, four int16 each
+    movq    mm1, [eax+ 8]
+    movq    mm2, [eax+16]
+    movq    mm3, [eax+24]
+
+    ; same two butterfly/transpose passes as x264_dct4x4dc_mmxext, but without the final (x+1)>>1 rounding
+    MMX_SUMSUB_BADC     mm1, mm0, mm3, mm2          ; mm1=s01  mm0=d01  mm3=s23  mm2=d23
+    MMX_SUMSUB_BADC     mm3, mm1, mm2, mm0          ; mm3=s01+s23 mm1=s01-s23 mm2=d01+d23 mm0=d01-d23
+
+    MMX_TRANSPOSE       mm3, mm1, mm0, mm2, mm4     ; in: mm3, mm1, mm0, mm2  out: mm3, mm2, mm4, mm0
+    ; second pass on the transposed data
+    MMX_SUMSUB_BADC     mm2, mm3, mm0, mm4          ; mm2=s01  mm3=d01  mm0=s23  mm4=d23
+    MMX_SUMSUB_BADC     mm0, mm2, mm4, mm3          ; mm0=s01+s23  mm2=s01-s23  mm4=d01+d23  mm3=d01-d23
+
+    MMX_TRANSPOSE       mm0, mm2, mm3, mm4, mm1     ; in: mm0, mm2, mm3, mm4  out: mm0, mm4, mm1, mm3
+
+    movq    [eax+ 0],   mm0                         ; write the four result rows back over d
+    movq    [eax+ 8],   mm4
+    movq    [eax+16],   mm1
+    movq    [eax+24],   mm3
+    ret
+
+cglobal x264_sub4x4_dct_mmxext
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+;   void __cdecl x264_sub4x4_dct_mmxext( int16_t dct[4][4], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
+;-----------------------------------------------------------------------------
+x264_sub4x4_dct_mmxext:
+    push    ebx             ; ebx is used for i_pix1; after this push the arguments start at esp+8
+    mov     eax, [esp+12]   ; pix1
+    mov     ebx, [esp+16]   ; i_pix1
+    mov     ecx, [esp+20]   ; pix2
+    mov     edx, [esp+24]   ; i_pix2
+
+    MMX_ZERO    mm7         ; mm7 stays zero: it is the unpack operand of every load below
+
+    ; Load 4 lines: mm0..mm3 = pix1 row - pix2 row, as four signed words each
+    MMX_LOAD_DIFF_4P    mm0, mm6, mm7, [eax      ], [ecx]
+    MMX_LOAD_DIFF_4P    mm1, mm6, mm7, [eax+ebx  ], [ecx+edx]
+    MMX_LOAD_DIFF_4P    mm2, mm6, mm7, [eax+ebx*2], [ecx+edx*2]
+    add     eax, ebx        ; advance both pointers by one row so that base + 2*stride
+    add     ecx, edx        ; addresses row 3 (no *3 scale exists in x86 addressing)
+    MMX_LOAD_DIFF_4P    mm3, mm6, mm7, [eax+ebx*2], [ecx+edx*2]
+
+    MMX_SUMSUB_BADC     mm3, mm0, mm2, mm1          ; mm3=s03  mm0=d03  mm2=s12  mm1=d12
+
+    MMX_SUMSUB_BA       mm2, mm3                    ; mm2=s03+s12      mm3=s03-s12
+    MMX_SUMSUB2_AB      mm0, mm1, mm4               ; mm0=2.d03+d12    mm4=d03-2.d12
+
+    ; transpose in: mm2, mm0, mm3, mm4, out: mm2, mm4, mm1, mm3
+    MMX_TRANSPOSE       mm2, mm0, mm3, mm4, mm1
+
+    MMX_SUMSUB_BADC     mm3, mm2, mm1, mm4          ; mm3=s03  mm2=d03  mm1=s12  mm4=d12
+
+    MMX_SUMSUB_BA       mm1, mm3                    ; mm1=s03+s12      mm3=s03-s12
+    MMX_SUMSUB2_AB      mm2, mm4, mm0               ; mm2=2.d03+d12    mm0=d03-2.d12
+
+    ; transpose in: mm1, mm2, mm3, mm0, out: mm1, mm0, mm4, mm3
+    MMX_TRANSPOSE       mm1, mm2, mm3, mm0, mm4
+
+    mov     eax, [esp+ 8]   ; dct
+    movq    [eax+ 0],   mm1 ; store the four coefficient rows
+    movq    [eax+ 8],   mm0
+    movq    [eax+16],   mm4
+    movq    [eax+24],   mm3
+
+    pop     ebx
+    ret
+
+cglobal x264_add4x4_idct_mmxext
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+;   void __cdecl x264_add4x4_idct_mmxext( uint8_t *p_dst, int i_dst, int16_t dct[4][4] )
+;-----------------------------------------------------------------------------
+x264_add4x4_idct_mmxext:
+
+    ; Load dct coeffs
+    mov     eax, [esp+12]   ; dct
+    movq    mm0, [eax+ 0]   ; mm0..mm3 = coefficient rows 0..3
+    movq    mm1, [eax+ 8]
+    movq    mm2, [eax+16]
+    movq    mm3, [eax+24]
+    
+    mov     eax, [esp+ 4]   ; p_dst
+    mov     ecx, [esp+ 8]   ; i_dst
+    lea     edx, [ecx+ecx*2]    ; edx = 3*i_dst, the offset of destination row 3
+
+    MMX_SUMSUB_BA       mm2, mm0                        ; mm2=s02  mm0=d02
+    MMX_SUMSUBD2_AB     mm1, mm3, mm5, mm4              ; mm1=s13  mm4=d13 ( well 1 + 3>>1 and 1>>1 + 3)
+
+    MMX_SUMSUB_BADC     mm1, mm2, mm4, mm0              ; mm1=s02+s13  mm2=s02-s13  mm4=d02+d13  mm0=d02-d13
+
+    ; in: mm1, mm4, mm0, mm2  out: mm1, mm2, mm3, mm0
+    MMX_TRANSPOSE       mm1, mm4, mm0, mm2, mm3
+
+    MMX_SUMSUB_BA       mm3, mm1                        ; mm3=s02  mm1=d02
+    MMX_SUMSUBD2_AB     mm2, mm0, mm5, mm4              ; mm2=s13  mm4=d13 ( well 1 + 3>>1 and 1>>1 + 3)
+
+    MMX_SUMSUB_BADC     mm2, mm3, mm4, mm1              ; mm2=s02+s13  mm3=s02-s13  mm4=d02+d13  mm1=d02-d13
+
+    ; in: mm2, mm4, mm1, mm3  out: mm2, mm3, mm0, mm1
+    MMX_TRANSPOSE       mm2, mm4, mm1, mm3, mm0
+
+    MMX_ZERO            mm7                             ; zero register for the byte/word unpack in the stores
+    movq                mm6, [x264_mmx_32]              ; rounding constant: four words of 32, applied before the >> 6
+    
+    ; each row: dst[i] = clamp to 0..255 of ( dst[i] + ((row[i] + 32) >> 6) ); mm4 is scratch
+    MMX_STORE_DIFF_4P   mm2, mm4, mm6, mm7, [eax]
+    MMX_STORE_DIFF_4P   mm3, mm4, mm6, mm7, [eax+ecx]
+    MMX_STORE_DIFF_4P   mm0, mm4, mm6, mm7, [eax+ecx*2]
+    MMX_STORE_DIFF_4P   mm1, mm4, mm6, mm7, [eax+edx]
+
+    ret
+
diff --git a/core/i386/dct.h b/core/i386/dct.h
new file mode 100644
index 00000000..23601e5e
--- /dev/null
+++ b/core/i386/dct.h
@@ -0,0 +1,38 @@
+/*****************************************************************************
+ * dct.h: h264 encoder library
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: dct.h,v 1.1 2004/06/03 19:27:07 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#ifndef _I386_DCT_H
+#define _I386_DCT_H 1
+
+void x264_sub4x4_dct_mmxext( int16_t dct[4][4],  uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 );
+void x264_sub8x8_dct_mmxext( int16_t dct[4][4][4],  uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 );
+void x264_sub16x16_dct_mmxext( int16_t dct[16][4][4],  uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 );
+
+void x264_add4x4_idct_mmxext( uint8_t *p_dst, int i_dst, int16_t dct[4][4] );
+void x264_add8x8_idct_mmxext( uint8_t *p_dst, int i_dst, int16_t dct[4][4][4] );
+void x264_add16x16_idct_mmxext( uint8_t *p_dst, int i_dst, int16_t dct[16][4][4] );
+
+void x264_dct4x4dc_mmxext( int16_t d[4][4] );
+void x264_idct4x4dc_mmxext( int16_t d[4][4] );
+
+#endif
diff --git a/core/i386/mc-c.c b/core/i386/mc-c.c
new file mode 100644
index 00000000..aa7363b1
--- /dev/null
+++ b/core/i386/mc-c.c
@@ -0,0 +1,940 @@
+/*****************************************************************************
+ * mc-c.c: h264 encoder library (Motion Compensation)
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: mc-c.c,v 1.1 2004/06/03 19:27:07 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdint.h>
+
+#include "../mc.h"
+#include "../clip1.h"
+#include "mc.h"
+
+#define UNUSED_UINT64( foo ) /* 64-bit constant that only inline asm references, by its symbol name */ \
+    static const uint64_t foo __asm__ (#foo)  __attribute__((used, unused))
+/* "used" keeps GCC from discarding the static (no C code references it); value = 16 in each 16-bit word */
+UNUSED_UINT64( x264_w0x10 ) = 0x0010001000100010ULL;
+
+
+#define MMX_ZERO( MMZ ) /* clear MMX register MMZ (pxor of a register with itself) */ \
+    asm volatile( "pxor " #MMZ ", " #MMZ "\n" :: )
+
+#define MMX_INIT( MMV, NAME ) /* load the 64-bit value stored at assembler symbol NAME into MMV */ \
+    asm volatile( "movq " #NAME ", " #MMV "\n" :: )
+
+#define MMX_SAVE_4P( MMP, MMZ, dst ) /* saturate the 4 words of MMP to unsigned bytes, store them at dst; MMP is overwritten */ \
+    __asm__ volatile( "packuswb " #MMZ  "," #MMP "\n" \
+                  "movd " #MMP ", (%0)" :: "r"(dst) : "memory" ) /* "memory": the store is invisible to GCC otherwise */
+
+#define MMX_LOAD_4P( MMP, MMZ, pix ) /* load the 4 bytes at pix into MMP as 4 words; the high bytes come from MMZ */ \
+    __asm__ volatile( "movd (%0), " #MMP "\n" \
+                  "punpcklbw  " #MMZ ", " #MMP "\n" : : "r"(pix) : "memory" ) /* "memory": the load is invisible to GCC otherwise */
+
+#define MMX_LOAD_4x4( MMP1, MMP2, MMP3, MMP4, MMZ, pix, i_pix ) /* rows 0..3 of pix (row stride i_pix) -> MMP1..MMP4, 4 pixels each */ \
+    MMX_LOAD_4P( MMP1, MMZ, &(pix)[0*(i_pix)] ); \
+    MMX_LOAD_4P( MMP2, MMZ, &(pix)[1*(i_pix)] ); \
+    MMX_LOAD_4P( MMP3, MMZ, &(pix)[2*(i_pix)] ); \
+    MMX_LOAD_4P( MMP4, MMZ, &(pix)[3*(i_pix)] )
+
+#define MMX_LOAD_2x4( MMP1, MMP2, MMZ, pix, i_pix ) /* rows 0 and 1 of pix (row stride i_pix) -> MMP1, MMP2, 4 pixels each */ \
+    MMX_LOAD_4P( MMP1, MMZ, &(pix)[0*(i_pix)] ); \
+    MMX_LOAD_4P( MMP2, MMZ, &(pix)[1*(i_pix)] )
+
+#define MMX_SAVEPACK_8P( MMP1, MMP2, MMZ, dst ) /* saturate MMP1 (pixels 0..3) and MMP2 (4..7) to 8 bytes at dst; MMZ is unused, MMP1 is overwritten */ \
+    __asm__ volatile( "packuswb " #MMP2  "," #MMP1 "\n" \
+                  "movq " #MMP1 ", (%0)\n" :: "r"(dst) : "memory" ) /* "memory": the store is invisible to GCC otherwise */
+
+
+#define MMX_LOAD_8P( MMP1, MMP2, MMZ, pix ) /* 8 bytes at pix -> words: pixels 0..3 in MMP1, pixels 4..7 in MMP2 */ \
+    __asm__ volatile( "movq         (%0)   , " #MMP1 "\n" \
+                  "movq       " #MMP1 ", " #MMP2 "\n" \
+                  "punpcklbw  " #MMZ  ", " #MMP1 "\n" \
+                  "punpckhbw  " #MMZ  ", " #MMP2 "\n" : : "r"(pix) : "memory" ) /* "memory": the load is invisible to GCC otherwise */
+
+#define MMX_LOAD_2x8( MMP1, MMP2, MMP3, MMP4, MMZ, pix, i_pix ) /* row 0 -> MMP1 (low 4 pixels), MMP2 (high 4); row 1 -> MMP3, MMP4 */ \
+    MMX_LOAD_8P( MMP1, MMP2, MMZ, &(pix)[0*(i_pix)] ); \
+    MMX_LOAD_8P( MMP3, MMP4, MMZ, &(pix)[1*(i_pix)] )
+
+#define SBUTTERFLYwd(a,b,t ) /* a <- low words of a and b interleaved; t <- high words of a and b interleaved */ \
+    asm volatile( "movq " #a ", " #t "        \n\t" \
+                  "punpcklwd " #b ", " #a "   \n\t" \
+                  "punpckhwd " #b ", " #t "   \n\t" :: )
+
+#define SBUTTERFLYdq(a,b,t ) /* a <- low dwords of a and b interleaved; t <- high dwords of a and b interleaved */ \
+    asm volatile( "movq " #a ", " #t "        \n\t" \
+                  "punpckldq " #b ", " #a "   \n\t" \
+                  "punpckhdq " #b ", " #t "   \n\t" :: )
+
+/* 4x4 word transpose: input rows in MMA,MMB,MMC,MMD; output rows 0..3 end up in MMA,MMD,MMT,MMC (MMB is clobbered) */
+#define MMX_TRANSPOSE( MMA, MMB, MMC, MMD, MMT ) \
+        SBUTTERFLYwd( MMA, MMB, MMT ); \
+        SBUTTERFLYwd( MMC, MMD, MMB ); \
+        SBUTTERFLYdq( MMA, MMC, MMD ); \
+        SBUTTERFLYdq( MMT, MMB, MMC )
+
+/* first pass: MMP0 = MMP0 - 5*MMP1 (side effect: MMP1 is left multiplied by 4) */
+#define MMX_FILTERTAP_P1( MMP0, MMP1 ) \
+    asm volatile( "psubw    " #MMP1 "," #MMP0 "\n" \
+                  "psllw      $2,     " #MMP1 "\n" \
+                  "psubw    " #MMP1 "," #MMP0 "\n" :: )
+                                                   \
+/* second pass: MMP0 = MMP0 + 20*(MMP2+MMP3), built as 4*s + 16*s; MMP2 is overwritten */
+#define MMX_FILTERTAP_P2( MMP0, MMP2, MMP3 ) \
+    asm volatile( "paddw    " #MMP3 "," #MMP2 "\n" \
+                                                 \
+                  "psllw      $2,     " #MMP2 "\n" \
+                  "paddw    " #MMP2 "," #MMP0 "\n" \
+                  "psllw      $2,     " #MMP2 "\n" \
+                  "paddw    " #MMP2 "," #MMP0 "\n" :: )
+
+/* last pass: MMP0 = ( MMP0 - 5*MMP1 + MMP2 + MMV ) >> 5 (arithmetic shift); MMP1 is left x4, MMZ is unused */
+#define MMX_FILTERTAP_P3( MMP0, MMP1, MMP2, MMV, MMZ ) \
+    asm volatile( "psubw    " #MMP1 "," #MMP0 "\n" \
+                  "psllw      $2,     " #MMP1 "\n" \
+                  "psubw    " #MMP1 "," #MMP0 "\n" \
+                                                   \
+                  "paddw    " #MMP2 "," #MMP0 "\n" \
+                  "paddw    " #MMV  "," #MMP0 "\n" \
+                  "psraw      $5,     " #MMP0 "\n" :: )
+
+#define MMX_FILTERTAP2_P1( MMP0, MMP1, MMP2, MMP3 ) /* first pass on two registers: MMP0 -= 5*MMP1 and MMP2 -= 5*MMP3; MMP1, MMP3 are left x4 */ \
+    asm volatile( "psubw    " #MMP1 "," #MMP0 "\n" \
+                  "psubw    " #MMP3 "," #MMP2 "\n" \
+                  "psllw      $2,     " #MMP1 "\n" \
+                  "psllw      $2,     " #MMP3 "\n" \
+                  "psubw    " #MMP1 "," #MMP0 "\n" \
+                  "psubw    " #MMP3 "," #MMP2 "\n" :: )
+
+/* second pass on two registers: MMP0 += 20*(MMP1+MMP2) and MMP3 += 20*(MMP4+MMP5); MMP1 and MMP4 are overwritten */
+#define MMX_FILTERTAP2_P2( MMP0, MMP1, MMP2, MMP3, MMP4, MMP5 ) \
+    asm volatile( "paddw    " #MMP2 "," #MMP1 "\n" \
+                  "paddw    " #MMP5 "," #MMP4 "\n" \
+                                                 \
+                  "psllw      $2,     " #MMP1 "\n" \
+                  "psllw      $2,     " #MMP4 "\n" \
+                  "paddw    " #MMP1 "," #MMP0 "\n" \
+                  "paddw    " #MMP4 "," #MMP3 "\n" \
+                  "psllw      $2,     " #MMP1 "\n" \
+                  "psllw      $2,     " #MMP4 "\n" \
+                  "paddw    " #MMP1 "," #MMP0 "\n" \
+                  "paddw    " #MMP4 "," #MMP3 "\n" :: )
+
+#define MMX_LOAD_1r( m1, dst ) /* m1 <- the 8 bytes at dst; expands to one statement without a trailing ';' */ \
+    __asm__ volatile( "movq (%0), " #m1 "\n" :: "r"(dst) : "memory" )
+
+#define MMX_SAVE_1r( m1, dst ) /* store the 8 bytes of m1 at dst; expands to one statement without a trailing ';' */ \
+    __asm__ volatile( "movq " #m1 ", (%0)\n" :: "r"(dst) : "memory" )
+
+#define MMX_LOAD_2r( m1, m2, dst, i_dst ) /* m1, m2 <- the 8 bytes at dst and at dst + i_dst (byte offsets) */ \
+    __asm__ volatile( "movq (%0), " #m1 "\n" :: "r"(&((uint8_t*)(dst))[0*(i_dst)]) : "memory" ); \
+    __asm__ volatile( "movq (%0), " #m2 "\n" :: "r"(&((uint8_t*)(dst))[1*(i_dst)]) : "memory" )
+
+#define MMX_SAVE_2r( m1, m2, dst, i_dst ) /* store m1 at dst and m2 at dst + i_dst (byte offsets), 8 bytes each */ \
+    __asm__ volatile( "movq " #m1 ", (%0)\n" :: "r"(&((uint8_t*)(dst))[0*(i_dst)]) : "memory" ); \
+    __asm__ volatile( "movq " #m2 ", (%0)\n" :: "r"(&((uint8_t*)(dst))[1*(i_dst)]) : "memory" )
+
+#define MMX_SAVE_4r( m1, m2, m3, m4, dst, i_dst ) /* store m1..m4 at dst + k*i_dst (byte offsets), k = 0..3, 8 bytes each */ \
+    __asm__ volatile( "movq " #m1 ", (%0)\n" :: "r"(&((uint8_t*)(dst))[0*(i_dst)]) : "memory" ); \
+    __asm__ volatile( "movq " #m2 ", (%0)\n" :: "r"(&((uint8_t*)(dst))[1*(i_dst)]) : "memory" ); \
+    __asm__ volatile( "movq " #m3 ", (%0)\n" :: "r"(&((uint8_t*)(dst))[2*(i_dst)]) : "memory" ); \
+    __asm__ volatile( "movq " #m4 ", (%0)\n" :: "r"(&((uint8_t*)(dst))[3*(i_dst)]) : "memory" )
+
+#define MMX_LOAD_4r( m1, m2, m3, m4, dst, i_dst ) /* m1..m4 <- the 8 bytes at dst + k*i_dst (byte offsets), k = 0..3 */ \
+    __asm__ volatile( "movq (%0), " #m1 "\n" :: "r"(&((uint8_t*)(dst))[0*(i_dst)]) : "memory" ); \
+    __asm__ volatile( "movq (%0), " #m2 "\n" :: "r"(&((uint8_t*)(dst))[1*(i_dst)]) : "memory" ); \
+    __asm__ volatile( "movq (%0), " #m3 "\n" :: "r"(&((uint8_t*)(dst))[2*(i_dst)]) : "memory" ); \
+    __asm__ volatile( "movq (%0), " #m4 "\n" :: "r"(&((uint8_t*)(dst))[3*(i_dst)]) : "memory" )
+
+
+static inline int x264_tapfilter( uint8_t *pix, int i_pix_next )
+{   /* 6-tap kernel (1,-5,20,20,-5,1) over samples i_pix_next apart; raw sum, neither rounded nor scaled */
+    return ( pix[-2*i_pix_next] + pix[ 3*i_pix_next] ) - 5*( pix[-1*i_pix_next] + pix[ 2*i_pix_next] ) + 20*( pix[0] + pix[1*i_pix_next] );
+}
+static inline int x264_tapfilter1( uint8_t *pix )
+{   /* the same 6-tap kernel (1,-5,20,20,-5,1) over adjacent bytes pix[-2..3]; raw sum */
+    return ( pix[-2] + pix[ 3] ) - 5*( pix[-1] + pix[ 2] ) + 20*( pix[0] + pix[1] );
+}
+
+static inline void pixel_avg_w4( uint8_t *dst,  int i_dst_stride,
+                                 uint8_t *src1, int i_src1_stride,
+                                 uint8_t *src2, int i_src2_stride,
+                                 int i_height )
+{
+    /* dst = mean of src1 and src2 rounded up, 4 pixels per row, i_height rows */
+    int i_row = i_height;
+    while( i_row-- > 0 )
+    {
+        int i_col;
+        for( i_col = 0; i_col != 4; ++i_col )
+            dst[i_col] = ( 1 + src1[i_col] + src2[i_col] ) >> 1;
+        dst  = &dst[i_dst_stride];
+        src1 = &src1[i_src1_stride];
+        src2 = &src2[i_src2_stride];
+    }
+}
+static inline void pixel_avg_w8( uint8_t *dst,  int i_dst_stride,
+                                 uint8_t *src1, int i_src1_stride,
+                                 uint8_t *src2, int i_src2_stride,
+                                 int i_height )
+{
+    int y; /* per row: dst[0..7] = pavgb( src1[0..7], src2[0..7] ); uses mm0/mm1 and issues no emms */
+    for( y = 0; y < i_height; y++ )
+    {
+        __asm__ volatile(
+            "movq (%1), %%mm0\n"
+            "movq (%2), %%mm1\n"
+            "pavgb %%mm1, %%mm0\n"
+            "movq %%mm0, (%0)\n"
+            : : "r"(dst), "r"(src1), "r"(src2)
+            : "memory" ); /* the asm reads and writes through the pointers; without this GCC cannot see it */
+        dst  += i_dst_stride;
+        src1 += i_src1_stride;
+        src2 += i_src2_stride;
+    }
+}
+static inline void pixel_avg_w16( uint8_t *dst,  int i_dst_stride,
+                                  uint8_t *src1, int i_src1_stride,
+                                  uint8_t *src2, int i_src2_stride,
+                                  int i_height )
+{
+    int y;
+    /* per row: dst[0..15] = pavgb( src1[0..15], src2[0..15] ); uses mm0..mm3 and issues no emms */
+    for( y = 0; y < i_height; y++ )
+    {
+        __asm__ volatile(
+            "movq (%1), %%mm0\n"
+            "movq 8(%1), %%mm2\n"
+            "movq (%2), %%mm1\n"
+            "movq 8(%2), %%mm3\n"
+
+            "pavgb %%mm1, %%mm0\n"
+            "movq %%mm0, (%0)\n"
+            "pavgb %%mm3, %%mm2\n"
+            "movq %%mm2, 8(%0)\n"
+            : : "r"(dst), "r"(src1), "r"(src2)
+            : "memory" ); /* the asm reads and writes through the pointers; without this GCC cannot see it */
+        dst  += i_dst_stride;
+        src1 += i_src1_stride;
+        src2 += i_src2_stride;
+    }
+}
+
+typedef void (*pf_mc_t)(uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ); /* common signature of the routines stored in the pf_mc dispatch table */
+
+/*****************************************************************************
+ * MC with width == 4 (height <= 8)
+ *****************************************************************************/
+#if 0 /* disabled plain-C version: copies 4 bytes per row for i_height rows */
+static void mc_copy_w4( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
+{
+    int y;
+
+    for( y = 0; y < i_height; y++ )
+    {
+        memcpy( dst, src, 4 );
+
+        src += i_src_stride;
+        dst += i_dst_stride;
+    }
+}
+#else /* the version actually used is defined outside this file; NOTE(review): presumably in core/i386/mc.asm -- confirm */
+extern void mc_copy_w4( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height );
+#endif
+
+static inline void mc_hh_w4( uint8_t *src, int i_src, uint8_t *dst, int i_dst, int i_height )
+{
+    const int h4 = i_height / 4; /* rows are produced 4 at a time; a remainder of i_height % 4 rows is never written */
+    uint8_t  srct[4*8*3]; /* transposed source: 12 slots of 8 bytes, one per source column (4 rows as words) */
+    uint64_t tmp[4]; /* the 4 filtered output columns, 4 words each, before transposing back */
+    int y;
+    /* Horizontal 6-tap filter, 4 pixels wide: transpose 4 rows, filter across columns, transpose back. */
+    src -= 2; /* the filter window starts 2 pixels left of each output pixel */
+
+    MMX_ZERO( %%mm7 ); /* mm7 = 0, used as the zero operand for unpacking and packing */
+    MMX_INIT( %%mm6, x264_w0x10 ); /* mm6 = 16 in each word: rounding term added before the >> 5 */
+
+    for( y = 0; y < h4; y++ )
+    {
+        int i;
+
+        /* Preload the source block and transpose it: each register then holds one column of 4 rows */
+        MMX_LOAD_4x4 ( %%mm0, %%mm4, %%mm3, %%mm1, %%mm7, &src[0], i_src );
+        MMX_TRANSPOSE( %%mm0, %%mm4, %%mm3, %%mm1, %%mm2 ); /* 0123 */
+        MMX_SAVE_4r( %%mm0, %%mm1, %%mm2, %%mm3, &srct[4*8*0], 8 );
+
+        MMX_LOAD_4x4 ( %%mm0, %%mm4, %%mm3, %%mm1, %%mm7, &src[4], i_src );
+        MMX_TRANSPOSE( %%mm0, %%mm4, %%mm3, %%mm1, %%mm2 ); /* 0123 */
+        MMX_SAVE_4r( %%mm0, %%mm1, %%mm2, %%mm3, &srct[4*8*1], 8 );
+
+        /* only column 8 of this block is used by the filter below; columns 9..11 are read anyway */
+        MMX_LOAD_4x4 ( %%mm0, %%mm4, %%mm3, %%mm1, %%mm7, &src[8], i_src );
+        MMX_TRANSPOSE( %%mm0, %%mm4, %%mm3, %%mm1, %%mm2 ); /* 0123 */
+        MMX_SAVE_2r( %%mm0, %%mm1, &srct[4*8*2], 8 );
+
+        /* tap filter: output column i is built from transposed columns i..i+5 */
+        for( i = 0; i < 4; i++ )
+        {
+            MMX_LOAD_4r( %%mm0, %%mm1, %%mm2, %%mm3, &srct[8*(i+0)], 8 );
+            MMX_FILTERTAP_P1( %%mm0, %%mm1 );
+            MMX_FILTERTAP_P2( %%mm0, %%mm2, %%mm3 );
+
+            MMX_LOAD_2r( %%mm1, %%mm2, &srct[8*(i+4)], 8 );
+            MMX_FILTERTAP_P3( %%mm0, %%mm1, %%mm2, %%mm6, %%mm7 );
+
+            MMX_SAVE_1r( %%mm0, &tmp[i] );
+        }
+        /* transpose the filtered columns back into rows, then pack each row to 4 saturated bytes */
+        MMX_LOAD_4r( %%mm0, %%mm4, %%mm3, %%mm1, tmp, 8 );
+        MMX_TRANSPOSE( %%mm0, %%mm4, %%mm3, %%mm1, %%mm2 ); /* 0123 */
+        MMX_SAVE_4P( %%mm0, %%mm7, &dst[0*i_dst] );
+        MMX_SAVE_4P( %%mm1, %%mm7, &dst[1*i_dst] );
+        MMX_SAVE_4P( %%mm2, %%mm7, &dst[2*i_dst] );
+        MMX_SAVE_4P( %%mm3, %%mm7, &dst[3*i_dst] );
+
+        src += 4 * i_src;
+        dst += 4 * i_dst;
+    }
+}
+static inline void mc_hv_w4( uint8_t *src, int i_src, uint8_t *dst, int i_dst, int i_height )
+{
+    int y;
+    /* Vertical 6-tap filter, 4 pixels wide, one output row per iteration. */
+    src -= 2 * i_src; /* the filter window starts 2 rows above the output row */
+
+    MMX_ZERO( %%mm7 ); /* mm7 = 0, used as the zero operand for unpacking and packing */
+    MMX_INIT( %%mm6, x264_w0x10 ); /* mm6 = 16 in each word: rounding term added before the >> 5 */
+
+    for( y = 0; y < i_height; y++ )
+    {
+        MMX_LOAD_4x4( %%mm0, %%mm1, %%mm2, %%mm3, %%mm7, src, i_src ); /* window rows 0..3 */
+        MMX_FILTERTAP_P1( %%mm0, %%mm1 ); /* row0 - 5*row1 */
+        MMX_FILTERTAP_P2( %%mm0, %%mm2, %%mm3 ); /* + 20*(row2 + row3) */
+
+        MMX_LOAD_2x4( %%mm4, %%mm5, %%mm7, &src[4*i_src], i_src ); /* window rows 4 and 5 */
+        MMX_FILTERTAP_P3( %%mm0, %%mm4, %%mm5, %%mm6, %%mm7 ); /* - 5*row4 + row5 + 16, then >> 5 */
+        MMX_SAVE_4P( %%mm0, %%mm7, dst ); /* saturate to bytes and store 4 pixels */
+
+        src += i_src;
+        dst += i_dst;
+    }
+}
+
+static inline void mc_hc_w4( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
+{
+    /* vertical 6-tap sums on 9 neighbouring columns, then a horizontal 6-tap over those sums */
+    int i_row;
+    for( i_row = 0; i_row < i_height; i_row++, src += i_src_stride, dst += i_dst_stride )
+    {
+        int16_t i_vert[5+4];
+        int     i_col;
+
+        /* columns -2..+6 relative to the first output pixel */
+        for( i_col = -2; i_col < 3+4; i_col++ )
+            i_vert[i_col+2] = x264_tapfilter( &src[i_col], i_src_stride );
+
+        for( i_col = 0; i_col < 4; i_col++ )
+        {
+            const int16_t *t = &i_vert[i_col];
+            const int i_sum = t[0] - 5*t[1] + 20 * t[2] + 20 * t[3] -5*t[4] + t[5] + 512;
+
+            dst[i_col] = x264_mc_clip1( i_sum >> 10 );
+        }
+    }
+}
+
+/* mc I+H */
+static void mc_xy10_w4( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
+{
+    uint8_t half_h[4*8]; /* output of the horizontal filter, rows packed 4 bytes apart */
+    mc_hh_w4( &src[0], i_src_stride, half_h, 4, i_height );
+    pixel_avg_w4( dst, i_dst_stride, &src[0], i_src_stride, half_h, 4, i_height ); /* mean with the source pixel */
+}
+static void mc_xy30_w4( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
+{
+    uint8_t half_h[4*8]; /* output of the horizontal filter, rows packed 4 bytes apart */
+    mc_hh_w4( &src[0], i_src_stride, half_h, 4, i_height );
+    pixel_avg_w4( dst, i_dst_stride, &src[1], i_src_stride, half_h, 4, i_height ); /* mean with the pixel to the right */
+}
+/* mc I+V */
+static void mc_xy01_w4( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
+{
+    uint8_t half_v[4*8]; /* output of the vertical filter, rows packed 4 bytes apart */
+    mc_hv_w4( &src[0], i_src_stride, half_v, 4, i_height );
+    pixel_avg_w4( dst, i_dst_stride, &src[0], i_src_stride, half_v, 4, i_height ); /* mean with the source pixel */
+}
+static void mc_xy03_w4( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
+{
+    uint8_t half_v[4*8]; /* output of the vertical filter, rows packed 4 bytes apart */
+    mc_hv_w4( &src[0], i_src_stride, half_v, 4, i_height );
+    pixel_avg_w4( dst, i_dst_stride, &src[i_src_stride], i_src_stride, half_v, 4, i_height ); /* mean with the pixel below */
+}
+/* H+V */
+static void mc_xy11_w4( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
+{
+    uint8_t half_v[4*8]; /* vertical filter at src */
+    uint8_t half_h[4*8]; /* horizontal filter at src */
+
+    mc_hv_w4( &src[0], i_src_stride, half_v, 4, i_height );
+    mc_hh_w4( &src[0], i_src_stride, half_h, 4, i_height );
+    pixel_avg_w4( dst, i_dst_stride, half_v, 4, half_h, 4, i_height );
+}
+static void mc_xy31_w4( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
+{
+    uint8_t half_v[4*8]; /* vertical filter, one column to the right */
+    uint8_t half_h[4*8]; /* horizontal filter at src */
+
+    mc_hv_w4( &src[1], i_src_stride, half_v, 4, i_height );
+    mc_hh_w4( &src[0], i_src_stride, half_h, 4, i_height );
+    pixel_avg_w4( dst, i_dst_stride, half_v, 4, half_h, 4, i_height );
+}
+static void mc_xy13_w4( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
+{
+    uint8_t half_v[4*8]; /* vertical filter at src */
+    uint8_t half_h[4*8]; /* horizontal filter, one row down */
+
+    mc_hv_w4( &src[0],            i_src_stride, half_v, 4, i_height );
+    mc_hh_w4( &src[i_src_stride], i_src_stride, half_h, 4, i_height );
+    pixel_avg_w4( dst, i_dst_stride, half_v, 4, half_h, 4, i_height );
+}
+static void mc_xy33_w4( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
+{
+    uint8_t half_v[4*8]; /* vertical filter, one column to the right */
+    uint8_t half_h[4*8]; /* horizontal filter, one row down */
+
+    mc_hv_w4( &src[1],            i_src_stride, half_v, 4, i_height );
+    mc_hh_w4( &src[i_src_stride], i_src_stride, half_h, 4, i_height );
+    pixel_avg_w4( dst, i_dst_stride, half_v, 4, half_h, 4, i_height );
+}
+static void mc_xy21_w4( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
+{
+    uint8_t half_c[4*8]; /* two-dimensional (centre) filter at src */
+    uint8_t half_h[4*8]; /* horizontal filter at src */
+
+    mc_hc_w4( &src[0], i_src_stride, half_c, 4, i_height );
+    mc_hh_w4( &src[0], i_src_stride, half_h, 4, i_height );
+    pixel_avg_w4( dst, i_dst_stride, half_c, 4, half_h, 4, i_height );
+}
+static void mc_xy12_w4( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
+{
+    uint8_t half_c[4*8]; /* two-dimensional (centre) filter at src */
+    uint8_t half_v[4*8]; /* vertical filter at src */
+
+    mc_hc_w4( &src[0], i_src_stride, half_c, 4, i_height );
+    mc_hv_w4( &src[0], i_src_stride, half_v, 4, i_height );
+    pixel_avg_w4( dst, i_dst_stride, half_c, 4, half_v, 4, i_height );
+}
+static void mc_xy32_w4( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
+{
+    uint8_t half_c[4*8]; /* two-dimensional (centre) filter at src */
+    uint8_t half_v[4*8]; /* vertical filter, one column to the right */
+
+    mc_hc_w4( &src[0], i_src_stride, half_c, 4, i_height );
+    mc_hv_w4( &src[1], i_src_stride, half_v, 4, i_height );
+    pixel_avg_w4( dst, i_dst_stride, half_c, 4, half_v, 4, i_height );
+}
+static void mc_xy23_w4( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
+{
+    uint8_t half_c[4*8]; /* two-dimensional (centre) filter at src */
+    uint8_t half_h[4*8]; /* horizontal filter, one row down */
+
+    mc_hc_w4( &src[0],            i_src_stride, half_c, 4, i_height );
+    mc_hh_w4( &src[i_src_stride], i_src_stride, half_h, 4, i_height );
+    pixel_avg_w4( dst, i_dst_stride, half_c, 4, half_h, 4, i_height );
+}
+
+
+/*****************************************************************************
+ * MC with width == 8 (height <= 16)
+ *****************************************************************************/
+#if 0 /* disabled plain-C version: copies 8 bytes per row for i_height rows */
+static void mc_copy_w8( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
+{
+    int y;
+
+    for( y = 0; y < i_height; y++ )
+    {
+        memcpy( dst, src, 8 );
+
+        src += i_src_stride;
+        dst += i_dst_stride;
+    }
+}
+#else /* the version actually used is defined outside this file; NOTE(review): presumably in core/i386/mc.asm -- confirm */
+extern void mc_copy_w8( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height );
+#endif
+
+static inline void mc_hh_w8( uint8_t *src, int i_src, uint8_t *dst, int i_dst, int i_height )
+{
+    int x; /* two independent 4-pixel-wide strips, left one first */
+    for( x = 0; x < 8; x += 4 ) mc_hh_w4( src + x, i_src, dst + x, i_dst, i_height );
+}
+static inline void mc_hv_w8( uint8_t *src, int i_src, uint8_t *dst, int i_dst, int i_height )
+{
+    int y;
+    /* Vertical 6-tap filter, 8 pixels wide: low 4 pixels accumulate in mm0, high 4 in mm5. */
+    src -= 2 * i_src; /* the filter window starts 2 rows above the output row */
+
+    MMX_ZERO( %%mm7 ); /* mm7 = 0, used as the zero operand for unpacking */
+    MMX_INIT( %%mm6, x264_w0x10 ); /* mm6 = 16 in each word: rounding term added before the >> 5 */
+
+    for( y = 0; y < i_height; y++ )
+    {
+        MMX_LOAD_2x8( %%mm0, %%mm5, %%mm1, %%mm2, %%mm7,  &src[0*i_src], i_src ); /* window rows 0 and 1 */
+        MMX_FILTERTAP2_P1( %%mm0, %%mm1, %%mm5, %%mm2 ); /* row0 - 5*row1, both halves */
+
+
+        MMX_LOAD_2x8( %%mm1, %%mm3, %%mm2, %%mm4, %%mm7,  &src[2*i_src], i_src ); /* window rows 2 and 3 */
+        MMX_FILTERTAP2_P2( %%mm0, %%mm1, %%mm2, %%mm5, %%mm3, %%mm4 ); /* + 20*(row2 + row3), both halves */
+
+        MMX_LOAD_2x8( %%mm1, %%mm3, %%mm2, %%mm4, %%mm7,  &src[4*i_src], i_src ); /* window rows 4 and 5 */
+        MMX_FILTERTAP_P3( %%mm0, %%mm1, %%mm2, %%mm6, %%mm7 ); /* low half: - 5*row4 + row5 + 16, then >> 5 */
+        MMX_FILTERTAP_P3( %%mm5, %%mm3, %%mm4, %%mm6, %%mm7 ); /* high half, same computation */
+
+        MMX_SAVEPACK_8P( %%mm0, %%mm5, %%mm7, dst ); /* saturate both halves to bytes and store 8 pixels */
+
+        src += i_src;
+        dst += i_dst;
+    }
+}
+
+static inline void mc_hc_w8( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
+{
+    int x, y;
+    /* Two-dimensional 6-tap filter, 8 pixels wide: vertical sums per column, then a horizontal pass in C. */
+    asm volatile( "pxor %%mm7,        %%mm7\n" : : ); /* mm7 = 0 for the byte->word unpacks below */
+
+    for( y = 0; y < i_height; y++ )
+    {
+        int16_t tap[5+8]; /* vertical sums for columns -2..+10: [0..11] written by the asm, [12] by C */
+
+        /* first 8 */
+        asm volatile(
+            "leal   (%0, %1),   %%eax\n"
+
+            "movq       (%0),   %%mm0\n"    /* load pix-2 */
+            "movq       %%mm0,  %%mm2\n"
+            "punpcklbw  %%mm7,  %%mm0\n"
+            "punpckhbw  %%mm7,  %%mm2\n"
+
+            "movq       (%%eax),%%mm1\n"    /* load pix-1 */
+            "movq       %%mm1,  %%mm3\n"
+            "punpcklbw  %%mm7,  %%mm1\n"
+            "punpckhbw  %%mm7,  %%mm3\n"
+            "psubw      %%mm1,  %%mm0\n"
+            "psllw      $2,     %%mm1\n"
+            "psubw      %%mm1,  %%mm0\n"
+            "psubw      %%mm3,  %%mm2\n"
+            "psllw      $2,     %%mm3\n"
+            "psubw      %%mm3,  %%mm2\n"
+
+            "movq       (%%eax,%1),%%mm1\n"  /* load pix */
+            "movq       %%mm1,  %%mm3\n"
+            "punpcklbw  %%mm7,  %%mm1\n"
+            "punpckhbw  %%mm7,  %%mm3\n"
+            "psllw      $2,     %%mm1\n"
+            "paddw      %%mm1,  %%mm0\n"
+            "psllw      $2,     %%mm1\n"
+            "paddw      %%mm1,  %%mm0\n"
+            "psllw      $2,     %%mm3\n"
+            "paddw      %%mm3,  %%mm2\n"
+            "psllw      $2,     %%mm3\n"
+            "paddw      %%mm3,  %%mm2\n"
+
+            "movq       (%%eax,%1,2),%%mm1\n"  /* load pix+1 */
+            "movq       %%mm1,  %%mm3\n"
+            "punpcklbw  %%mm7,  %%mm1\n"
+            "punpckhbw  %%mm7,  %%mm3\n"
+            "psllw      $2,     %%mm1\n"
+            "paddw      %%mm1,  %%mm0\n"
+            "psllw      $2,     %%mm1\n"
+            "paddw      %%mm1,  %%mm0\n"
+            "psllw      $2,     %%mm3\n"
+            "paddw      %%mm3,  %%mm2\n"
+            "psllw      $2,     %%mm3\n"
+            "paddw      %%mm3,  %%mm2\n"
+
+            "movq       (%0,%1,4),%%mm1\n"  /* load pix+2 */
+            "movq       %%mm1,  %%mm3\n"
+            "punpcklbw  %%mm7,  %%mm1\n"
+            "punpckhbw  %%mm7,  %%mm3\n"
+            "psubw      %%mm1,  %%mm0\n"
+            "psllw      $2,     %%mm1\n"
+            "psubw      %%mm1,  %%mm0\n"
+            "psubw      %%mm3,  %%mm2\n"
+            "psllw      $2,     %%mm3\n"
+            "psubw      %%mm3,  %%mm2\n"
+
+            "movq       (%%eax,%1,4),%%mm1\n"  /* load pix+3 */
+            "movq       %%mm1,  %%mm3\n"
+            "punpcklbw  %%mm7,  %%mm1\n"
+            "punpckhbw  %%mm7,  %%mm3\n"
+            "paddw      %%mm1,  %%mm0\n"
+            "paddw      %%mm3,  %%mm2\n"
+
+            "movq       %%mm0,   (%2)\n"
+            "movq       %%mm2,  8(%2)\n"
+
+            /* next 4 columns: eax is a declared scratch register and may be advanced ... */
+            "addl   $8,         %%eax\n"
+            /* ... but %0 is an input-only operand and must stay unmodified, so use an 8-byte displacement */
+
+
+            "movd      8(%0),   %%mm0\n"    /* load pix-2 */
+            "punpcklbw  %%mm7,  %%mm0\n"
+
+            "movd       (%%eax),%%mm1\n"    /* load pix-1 */
+            "punpcklbw  %%mm7,  %%mm1\n"
+            "psubw      %%mm1,  %%mm0\n"
+            "psllw      $2,     %%mm1\n"
+            "psubw      %%mm1,  %%mm0\n"
+
+            "movd       (%%eax,%1),%%mm1\n"  /* load pix */
+            "punpcklbw  %%mm7,  %%mm1\n"
+            "psllw      $2,     %%mm1\n"
+            "paddw      %%mm1,  %%mm0\n"
+            "psllw      $2,     %%mm1\n"
+            "paddw      %%mm1,  %%mm0\n"
+
+            "movd       (%%eax,%1,2),%%mm1\n"  /* load pix+1 */
+            "punpcklbw  %%mm7,  %%mm1\n"
+            "psllw      $2,     %%mm1\n"
+            "paddw      %%mm1,  %%mm0\n"
+            "psllw      $2,     %%mm1\n"
+            "paddw      %%mm1,  %%mm0\n"
+
+            "movd      8(%0,%1,4),%%mm1\n"  /* load pix+2 */
+            "punpcklbw  %%mm7,  %%mm1\n"
+            "psubw      %%mm1,  %%mm0\n"
+            "psllw      $2,     %%mm1\n"
+            "psubw      %%mm1,  %%mm0\n"
+
+            "movd       (%%eax,%1,4),%%mm1\n"  /* load pix+3 */
+            "punpcklbw  %%mm7,  %%mm1\n"
+            "paddw      %%mm1,  %%mm0\n"
+
+            "movq       %%mm0,  16(%2)\n"
+            : : "r"(src-2*i_src_stride-2), "r"(i_src_stride), "r"(&tap[0]) : "%eax", "memory" ); /* "memory": tap[] is written behind GCC's back */
+
+        /* last one */
+        tap[8+4] = x264_tapfilter( &src[-2+8+4], i_src_stride );
+
+        for( x = 0; x < 8; x++ )
+        {
+            dst[x] = x264_mc_clip1( ( tap[0+x] - 5*tap[1+x] + 20 * tap[2+x] + 20 * tap[3+x] -5*tap[4+x] + tap[5+x] + 512 ) >> 10 );
+        }
+
+        src += i_src_stride;
+        dst += i_dst_stride;
+    }
+}
+
+/* mc I+H */
+static void mc_xy10_w8( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
+{
+    uint8_t half_h[8*16]; /* output of the horizontal filter, rows packed 8 bytes apart */
+    mc_hh_w8( &src[0], i_src_stride, half_h, 8, i_height );
+    pixel_avg_w8( dst, i_dst_stride, &src[0], i_src_stride, half_h, 8, i_height ); /* mean with the source pixel */
+}
+static void mc_xy30_w8( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
+{
+    uint8_t half_h[8*16]; /* output of the horizontal filter, rows packed 8 bytes apart */
+    mc_hh_w8( &src[0], i_src_stride, half_h, 8, i_height );
+    pixel_avg_w8( dst, i_dst_stride, &src[1], i_src_stride, half_h, 8, i_height ); /* mean with the pixel to the right */
+}
+/* mc I+V */
+static void mc_xy01_w8( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
+{
+    uint8_t half_v[8*16]; /* output of the vertical filter, rows packed 8 bytes apart */
+    mc_hv_w8( &src[0], i_src_stride, half_v, 8, i_height );
+    pixel_avg_w8( dst, i_dst_stride, &src[0], i_src_stride, half_v, 8, i_height ); /* mean with the source pixel */
+}
+static void mc_xy03_w8( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
+{
+    uint8_t half_v[8*16]; /* output of the vertical filter, rows packed 8 bytes apart */
+    mc_hv_w8( &src[0], i_src_stride, half_v, 8, i_height );
+    pixel_avg_w8( dst, i_dst_stride, &src[i_src_stride], i_src_stride, half_v, 8, i_height ); /* mean with the pixel below */
+}
+/* H+V */
+static void mc_xy11_w8( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
+{
+    uint8_t half_v[8*16]; /* vertical filter at src */
+    uint8_t half_h[8*16]; /* horizontal filter at src */
+
+    mc_hv_w8( &src[0], i_src_stride, half_v, 8, i_height );
+    mc_hh_w8( &src[0], i_src_stride, half_h, 8, i_height );
+    pixel_avg_w8( dst, i_dst_stride, half_v, 8, half_h, 8, i_height );
+}
+static void mc_xy31_w8( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
+{
+    uint8_t half_v[8*16]; /* vertical filter, one column to the right */
+    uint8_t half_h[8*16]; /* horizontal filter at src */
+
+    mc_hv_w8( &src[1], i_src_stride, half_v, 8, i_height );
+    mc_hh_w8( &src[0], i_src_stride, half_h, 8, i_height );
+    pixel_avg_w8( dst, i_dst_stride, half_v, 8, half_h, 8, i_height );
+}
+static void mc_xy13_w8( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
+{
+    uint8_t half_v[8*16]; /* vertical filter at src */
+    uint8_t half_h[8*16]; /* horizontal filter, one row down */
+
+    mc_hv_w8( &src[0],            i_src_stride, half_v, 8, i_height );
+    mc_hh_w8( &src[i_src_stride], i_src_stride, half_h, 8, i_height );
+    pixel_avg_w8( dst, i_dst_stride, half_v, 8, half_h, 8, i_height );
+}
+static void mc_xy33_w8( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
+{
+    uint8_t half_v[8*16]; /* vertical filter, one column to the right */
+    uint8_t half_h[8*16]; /* horizontal filter, one row down */
+
+    mc_hv_w8( &src[1],            i_src_stride, half_v, 8, i_height );
+    mc_hh_w8( &src[i_src_stride], i_src_stride, half_h, 8, i_height );
+    pixel_avg_w8( dst, i_dst_stride, half_v, 8, half_h, 8, i_height );
+}
+static void mc_xy21_w8( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
+{
+    uint8_t half_c[8*16]; /* two-dimensional (centre) filter at src */
+    uint8_t half_h[8*16]; /* horizontal filter at src */
+
+    mc_hc_w8( &src[0], i_src_stride, half_c, 8, i_height );
+    mc_hh_w8( &src[0], i_src_stride, half_h, 8, i_height );
+    pixel_avg_w8( dst, i_dst_stride, half_c, 8, half_h, 8, i_height );
+}
+static void mc_xy12_w8( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
+{
+    uint8_t half_c[8*16]; /* two-dimensional (centre) filter at src */
+    uint8_t half_v[8*16]; /* vertical filter at src */
+
+    mc_hc_w8( &src[0], i_src_stride, half_c, 8, i_height );
+    mc_hv_w8( &src[0], i_src_stride, half_v, 8, i_height );
+    pixel_avg_w8( dst, i_dst_stride, half_c, 8, half_v, 8, i_height );
+}
+static void mc_xy32_w8( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
+{
+    uint8_t half_c[8*16]; /* two-dimensional (centre) filter at src */
+    uint8_t half_v[8*16]; /* vertical filter, one column to the right */
+
+    mc_hc_w8( &src[0], i_src_stride, half_c, 8, i_height );
+    mc_hv_w8( &src[1], i_src_stride, half_v, 8, i_height );
+    pixel_avg_w8( dst, i_dst_stride, half_c, 8, half_v, 8, i_height );
+}
+static void mc_xy23_w8( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
+{
+    uint8_t half_c[8*16]; /* two-dimensional (centre) filter at src */
+    uint8_t half_h[8*16]; /* horizontal filter, one row down */
+
+    mc_hc_w8( &src[0],            i_src_stride, half_c, 8, i_height );
+    mc_hh_w8( &src[i_src_stride], i_src_stride, half_h, 8, i_height );
+    pixel_avg_w8( dst, i_dst_stride, half_c, 8, half_h, 8, i_height );
+}
+
+
+/*****************************************************************************
+ * MC with width == 16 (height <= 16)
+ *****************************************************************************/
+#if 0 /* disabled plain-C version: copies 16 bytes per row for i_height rows */
+static void mc_copy_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
+{
+    int y;
+
+    for( y = 0; y < i_height; y++ )
+    {
+        memcpy( dst, src, 16 );
+
+        src += i_src_stride;
+        dst += i_dst_stride;
+    }
+}
+#else /* the version actually used is defined outside this file; NOTE(review): presumably in core/i386/mc.asm -- confirm */
+extern void mc_copy_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height );
+#endif
+static inline void mc_hh_w16( uint8_t *src, int i_src, uint8_t *dst, int i_dst, int i_height )
+{
+    int x;
+    /* four independent 4-pixel-wide strips, processed left to right */
+    for( x = 0; x < 16; x += 4 )
+        mc_hh_w4( src + x, i_src, dst + x, i_dst, i_height );
+}
+static inline void mc_hv_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
+{
+    int x; /* left 8-pixel half first, then the right one */
+    for( x = 0; x <= 8; x += 8 ) mc_hv_w8( src + x, i_src_stride, dst + x, i_dst_stride, i_height );
+}
+
+static inline void mc_hc_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
+{
+    int x; /* left 8-pixel half first, then the right one */
+    for( x = 0; x <= 8; x += 8 ) mc_hc_w8( src + x, i_src_stride, dst + x, i_dst_stride, i_height );
+}
+
+/* mc I+H */
+static void mc_xy10_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
+{
+    uint8_t half_h[16*16]; /* output of the horizontal filter, rows packed 16 bytes apart */
+    mc_hh_w16( &src[0], i_src_stride, half_h, 16, i_height );
+    pixel_avg_w16( dst, i_dst_stride, &src[0], i_src_stride, half_h, 16, i_height ); /* mean with the source pixel */
+}
+static void mc_xy30_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
+{
+    uint8_t half_h[16*16]; /* output of the horizontal filter, rows packed 16 bytes apart */
+    mc_hh_w16( &src[0], i_src_stride, half_h, 16, i_height );
+    pixel_avg_w16( dst, i_dst_stride, &src[1], i_src_stride, half_h, 16, i_height ); /* mean with the pixel to the right */
+}
+/* mc I+V */
+static void mc_xy01_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
+{
+    uint8_t half_v[16*16]; /* output of the vertical filter, rows packed 16 bytes apart */
+    mc_hv_w16( &src[0], i_src_stride, half_v, 16, i_height );
+    pixel_avg_w16( dst, i_dst_stride, &src[0], i_src_stride, half_v, 16, i_height ); /* mean with the source pixel */
+}
+static void mc_xy03_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
+{
+    uint8_t half_v[16*16]; /* output of the vertical filter, rows packed 16 bytes apart */
+    mc_hv_w16( &src[0], i_src_stride, half_v, 16, i_height );
+    pixel_avg_w16( dst, i_dst_stride, &src[i_src_stride], i_src_stride, half_v, 16, i_height ); /* mean with the pixel below */
+}
+/* H+V */
+static void mc_xy11_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
+{
+    uint8_t half_v[16*16]; /* vertical filter at src */
+    uint8_t half_h[16*16]; /* horizontal filter at src */
+
+    mc_hv_w16( &src[0], i_src_stride, half_v, 16, i_height );
+    mc_hh_w16( &src[0], i_src_stride, half_h, 16, i_height );
+    pixel_avg_w16( dst, i_dst_stride, half_v, 16, half_h, 16, i_height );
+}
+static void mc_xy31_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
+{
+    uint8_t half_v[16*16]; /* vertical filter, one column to the right */
+    uint8_t half_h[16*16]; /* horizontal filter at src */
+
+    mc_hv_w16( &src[1], i_src_stride, half_v, 16, i_height );
+    mc_hh_w16( &src[0], i_src_stride, half_h, 16, i_height );
+    pixel_avg_w16( dst, i_dst_stride, half_v, 16, half_h, 16, i_height );
+}
+static void mc_xy13_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
+{
+    uint8_t half_v[16*16]; /* vertical filter at src */
+    uint8_t half_h[16*16]; /* horizontal filter, one row down */
+
+    mc_hv_w16( &src[0],            i_src_stride, half_v, 16, i_height );
+    mc_hh_w16( &src[i_src_stride], i_src_stride, half_h, 16, i_height );
+    pixel_avg_w16( dst, i_dst_stride, half_v, 16, half_h, 16, i_height );
+}
+static void mc_xy33_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
+{
+    uint8_t half_v[16*16]; /* vertical filter, one column to the right */
+    uint8_t half_h[16*16]; /* horizontal filter, one row down */
+
+    mc_hv_w16( &src[1],            i_src_stride, half_v, 16, i_height );
+    mc_hh_w16( &src[i_src_stride], i_src_stride, half_h, 16, i_height );
+    pixel_avg_w16( dst, i_dst_stride, half_v, 16, half_h, 16, i_height );
+}
+static void mc_xy21_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
+{
+    uint8_t half_c[16*16]; /* two-dimensional (centre) filter at src */
+    uint8_t half_h[16*16]; /* horizontal filter at src */
+
+    mc_hc_w16( &src[0], i_src_stride, half_c, 16, i_height );
+    mc_hh_w16( &src[0], i_src_stride, half_h, 16, i_height );
+    pixel_avg_w16( dst, i_dst_stride, half_c, 16, half_h, 16, i_height );
+}
+static void mc_xy12_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
+{
+    uint8_t half_c[16*16]; /* two-dimensional (centre) filter at src */
+    uint8_t half_v[16*16]; /* vertical filter at src */
+
+    mc_hc_w16( &src[0], i_src_stride, half_c, 16, i_height );
+    mc_hv_w16( &src[0], i_src_stride, half_v, 16, i_height );
+    pixel_avg_w16( dst, i_dst_stride, half_c, 16, half_v, 16, i_height );
+}
+static void mc_xy32_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
+{
+    uint8_t half_c[16*16]; /* two-dimensional (centre) filter at src */
+    uint8_t half_v[16*16]; /* vertical filter, one column to the right */
+
+    mc_hc_w16( &src[0], i_src_stride, half_c, 16, i_height );
+    mc_hv_w16( &src[1], i_src_stride, half_v, 16, i_height );
+    pixel_avg_w16( dst, i_dst_stride, half_c, 16, half_v, 16, i_height );
+}
+static void mc_xy23_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
+{
+    uint8_t half_c[16*16]; /* two-dimensional (centre) filter at src */
+    uint8_t half_h[16*16]; /* horizontal filter, one row down */
+
+    mc_hc_w16( &src[0],            i_src_stride, half_c, 16, i_height );
+    mc_hh_w16( &src[i_src_stride], i_src_stride, half_h, 16, i_height );
+    pixel_avg_w16( dst, i_dst_stride, half_c, 16, half_h, 16, i_height );
+}
+
+static void motion_compensation_luma( uint8_t *src, int i_src_stride,
+                                      uint8_t *dst, int i_dst_stride,
+                                      int mvx,int mvy,
+                                      int i_width, int i_height )
+{
+    static const pf_mc_t pf_mc[3][4][4] =    /*XXX [dqy][dqx] */
+    {
+        {
+            { mc_copy_w4,  mc_xy10_w4,    mc_hh_w4,      mc_xy30_w4 },
+            { mc_xy01_w4,  mc_xy11_w4,    mc_xy21_w4,    mc_xy31_w4 },
+            { mc_hv_w4,    mc_xy12_w4,    mc_hc_w4,      mc_xy32_w4 },
+            { mc_xy03_w4,  mc_xy13_w4,    mc_xy23_w4,    mc_xy33_w4 },
+        },
+        {
+            { mc_copy_w8,  mc_xy10_w8,    mc_hh_w8,      mc_xy30_w8 },
+            { mc_xy01_w8,  mc_xy11_w8,    mc_xy21_w8,    mc_xy31_w8 },
+            { mc_hv_w8,    mc_xy12_w8,    mc_hc_w8,      mc_xy32_w8 },
+            { mc_xy03_w8,  mc_xy13_w8,    mc_xy23_w8,    mc_xy33_w8 },
+        },
+        {
+            { mc_copy_w16,  mc_xy10_w16,    mc_hh_w16,      mc_xy30_w16 },
+            { mc_xy01_w16,  mc_xy11_w16,    mc_xy21_w16,    mc_xy31_w16 },
+            { mc_hv_w16,    mc_xy12_w16,    mc_hc_w16,      mc_xy32_w16 },
+            { mc_xy03_w16,  mc_xy13_w16,    mc_xy23_w16,    mc_xy33_w16 },
+        }
+    };
+    /* The motion vector is in 1/4 pixel units: mv >> 2 is the whole-pixel offset into src,
+     * mv & 3 selects the interpolation routine. An unsupported i_width is reported on stderr
+     * and nothing is written. NOTE: >> on a negative mv relies on an arithmetic shift. */
+    const int i_dqx = mvx&0x03;
+    const int i_dqy = mvy&0x03;
+    int       i_size;
+
+    switch( i_width )
+    {
+        case 4:  i_size = 0; break;
+        case 8:  i_size = 1; break;
+        case 16: i_size = 2; break;
+        default:
+            fprintf( stderr, "Error: motion_compensation_luma called with invalid width\n" );
+            return;
+    }
+    src += (mvy >> 2) * i_src_stride + (mvx >> 2);
+    pf_mc[i_size][i_dqy][i_dqx]( src, i_src_stride, dst, i_dst_stride, i_height );
+}
+
+void x264_mc_mmxext_init( x264_mc_function_t pf[2] )
+{   /* install the luma routine of this file; no other entry of pf is written here */
+    *( pf + MC_LUMA ) = &motion_compensation_luma;
+}
+
diff --git a/core/i386/mc.asm b/core/i386/mc.asm
new file mode 100644
index 00000000..0210a6b3
--- /dev/null
+++ b/core/i386/mc.asm
@@ -0,0 +1,187 @@
+;*****************************************************************************
+;* mc.asm: h264 encoder library
+;*****************************************************************************
+;* Copyright (C) 2003 x264 project
+;* $Id: mc.asm,v 1.1 2004/06/03 19:27:07 fenrir Exp $
+;*
+;* Authors: Min Chen <chenm001@163.com> (converted to nasm)
+;*          Laurent Aimar <fenrir@via.ecp.fr> (init algorithm)
+;*
+;* This program is free software; you can redistribute it and/or modify
+;* it under the terms of the GNU General Public License as published by
+;* the Free Software Foundation; either version 2 of the License, or
+;* (at your option) any later version.
+;*
+;* This program is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;* GNU General Public License for more details.
+;*
+;* You should have received a copy of the GNU General Public License
+;* along with this program; if not, write to the Free Software
+;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+;*****************************************************************************
+
+;*****************************************************************************
+;*                                                                           *
+;*  Revision history:                                                        *
+;*                                                                           *
+;*  2004.05.17 portab mc_copy_w4/8/16 (CM)                                   *
+;*                                                                           *
+;*****************************************************************************
+
+BITS 32
+
+;=============================================================================
+; Macros and other preprocessor constants
+;=============================================================================
+
+%macro cglobal 1                ; cglobal <name>: export <name> as a global symbol
+	%ifdef PREFIX
+		global _%1              ; PREFIX builds: export with a leading underscore; the define below maps the bare name onto it
+		%define %1 _%1
+	%else
+		global %1               ; otherwise the name is exported unchanged
+	%endif
+%endmacro
+
+;=============================================================================
+; Local Data (Read Only)
+;=============================================================================
+
+%ifdef FORMAT_COFF
+SECTION .rodata data
+%else
+SECTION .rodata data align=16
+%endif
+
+;-----------------------------------------------------------------------------
+; Read-only constants (none are defined in this file; the section is empty)
+;-----------------------------------------------------------------------------
+
+ALIGN 16
+
+;=============================================================================
+; Code
+;=============================================================================
+
+SECTION .text
+
+cglobal mc_copy_w4
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+;   void mc_copy_w4( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
+;-----------------------------------------------------------------------------
+mc_copy_w4:                     ; copy a 4-byte-wide block from src to dst, two rows per loop iteration
+    push    ebx
+    push    esi
+    push    edi                 ; after three pushes the first stack argument sits at [esp+16]
+
+    mov     esi, [esp+16]       ; src
+    mov     edi, [esp+24]       ; dst
+    mov     ebx, [esp+20]       ; i_src_stride
+    mov     edx, [esp+28]       ; i_dst_stride
+    mov     ecx, [esp+32]       ; i_height
+ALIGN 4
+.height_loop
+    mov     eax, [esi]          ; row 0: load 4 bytes
+    mov     [edi], eax
+    mov     eax, [esi+ebx]      ; row 1: load 4 bytes
+    mov     [edi+edx], eax
+    lea     esi, [esi+ebx*2]    ; advance src by two rows
+    lea     edi, [edi+edx*2]    ; advance dst by two rows
+    dec     ecx
+    dec     ecx                 ; two rows consumed per iteration
+    jne     .height_loop        ; stops only when the count is exactly 0: i_height must be a positive multiple of 2
+
+    pop     edi
+    pop     esi
+    pop     ebx
+    ret
+
+cglobal mc_copy_w8
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+;   void mc_copy_w8( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
+;-----------------------------------------------------------------------------
+mc_copy_w8:                     ; copy an 8-byte-wide block from src to dst, four rows per loop iteration
+    push    ebx
+    push    esi
+    push    edi                 ; after three pushes the first stack argument sits at [esp+16]
+
+    mov     esi, [esp+16]       ; src
+    mov     edi, [esp+24]       ; dst
+    mov     ebx, [esp+20]       ; i_src_stride
+    mov     edx, [esp+28]       ; i_dst_stride
+    mov     ecx, [esp+32]       ; i_height
+ALIGN 4
+.height_loop
+    movq    mm0, [esi]          ; row 0
+    movq    [edi], mm0
+    movq    mm1, [esi+ebx]      ; row 1
+    movq    [edi+edx], mm1
+    movq    mm2, [esi+ebx*2]    ; row 2
+    movq    [edi+edx*2], mm2
+    lea     esi, [esi+ebx*2]    ; step both pointers down two rows...
+    lea     edi, [edi+edx*2]
+    movq    mm3, [esi+ebx]      ; ...so base+stride is now row 3 of the group
+    movq    [edi+edx], mm3
+    lea     esi, [esi+ebx*2]    ; pointers now address the next group of four rows
+    lea     edi, [edi+edx*2]
+    
+    sub     ecx, byte 4
+    jnz     .height_loop        ; stops only when the count is exactly 0: i_height must be a positive multiple of 4
+
+    pop     edi
+    pop     esi
+    pop     ebx
+    ret                         ; note: mm0-mm3 were used and no emms is executed in this routine
+
+cglobal mc_copy_w16
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+;   void mc_copy_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
+;-----------------------------------------------------------------------------
+mc_copy_w16:                    ; copy a 16-byte-wide block from src to dst, four rows per loop iteration
+    push    ebx
+    push    esi
+    push    edi                 ; after three pushes the first stack argument sits at [esp+16]
+
+    mov     esi, [esp+16]       ; src
+    mov     edi, [esp+24]       ; dst
+    mov     ebx, [esp+20]       ; i_src_stride
+    mov     edx, [esp+28]       ; i_dst_stride
+    mov     ecx, [esp+32]       ; i_height
+ALIGN 4
+.height_loop
+    movq    mm0, [esi]          ; row 0, bytes 0-7
+    movq    mm1, [esi+8]        ; row 0, bytes 8-15
+    movq    [edi], mm0
+    movq    [edi+8], mm1
+    movq    mm2, [esi+ebx]      ; row 1
+    movq    mm3, [esi+ebx+8]
+    movq    [edi+edx], mm2
+    movq    [edi+edx+8], mm3
+    movq    mm4, [esi+ebx*2]    ; row 2
+    movq    mm5, [esi+ebx*2+8]
+    movq    [edi+edx*2], mm4
+    movq    [edi+edx*2+8], mm5
+    lea     esi, [esi+ebx*2]    ; step both pointers down two rows...
+    lea     edi, [edi+edx*2]
+    movq    mm6, [esi+ebx]      ; ...so base+stride is now row 3 of the group
+    movq    mm7, [esi+ebx+8]
+    movq    [edi+edx], mm6
+    movq    [edi+edx+8], mm7
+    lea     esi, [esi+ebx*2]    ; pointers now address the next group of four rows
+    lea     edi, [edi+edx*2]
+    
+    sub     ecx, byte 4
+    jnz     .height_loop        ; stops only when the count is exactly 0: i_height must be a positive multiple of 4
+
+    pop     edi
+    pop     esi
+    pop     ebx
+    ret                         ; note: mm0-mm7 were used and no emms is executed in this routine
diff --git a/core/i386/mc.h b/core/i386/mc.h
new file mode 100644
index 00000000..c3e906fc
--- /dev/null
+++ b/core/i386/mc.h
@@ -0,0 +1,29 @@
+/*****************************************************************************
+ * mc.h: h264 encoder library
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: mc.h,v 1.1 2004/06/03 19:27:07 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#ifndef _I386_MC_H
+#define _I386_MC_H 1
+
+void x264_mc_mmxext_init( x264_mc_function_t pf[2] );   /* stores the i386 luma motion-compensation routine in pf[MC_LUMA] */
+
+#endif
diff --git a/core/i386/pixel.asm b/core/i386/pixel.asm
new file mode 100644
index 00000000..14015741
--- /dev/null
+++ b/core/i386/pixel.asm
@@ -0,0 +1,705 @@
+;*****************************************************************************
+;* pixel.asm: h264 encoder library
+;*****************************************************************************
+;* Copyright (C) 2003 x264 project
+;* $Id: pixel.asm,v 1.1 2004/06/03 19:27:07 fenrir Exp $
+;*
+;* Authors: Laurent Aimar <fenrir@via.ecp.fr>
+;*
+;* This program is free software; you can redistribute it and/or modify
+;* it under the terms of the GNU General Public License as published by
+;* the Free Software Foundation; either version 2 of the License, or
+;* (at your option) any later version.
+;*
+;* This program is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;* GNU General Public License for more details.
+;*
+;* You should have received a copy of the GNU General Public License
+;* along with this program; if not, write to the Free Software
+;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+;*****************************************************************************
+
+BITS 32
+
+;=============================================================================
+; Macros and other preprocessor constants
+;=============================================================================
+
+%macro cglobal 1                ; cglobal <name>: export <name> as a global symbol
+	%ifdef PREFIX
+		global _%1              ; PREFIX builds: export with a leading underscore; the define below maps the bare name onto it
+		%define %1 _%1
+	%else
+		global %1               ; otherwise the name is exported unchanged
+	%endif
+%endmacro
+
+%macro SAD_INC_2x16P 0          ; mm0 += SAD of two 16-byte rows at [eax] vs [ecx]; then both pointers step two rows
+    movq    mm1,    [eax]       ; row 0, bytes 0-7 of the first block
+    movq    mm2,    [ecx]       ; row 0, bytes 0-7 of the second block
+    movq    mm3,    [eax+8]     ; row 0, bytes 8-15
+    movq    mm4,    [ecx+8]
+
+    psadbw  mm1,    mm2         ; sum of absolute byte differences of each 8-byte half
+    psadbw  mm3,    mm4
+    paddw   mm0,    mm1         ; accumulate both halves
+    paddw   mm0,    mm3
+
+    movq    mm1,    [eax+ebx]   ; row 1 (ebx and edx are the two row strides)
+    movq    mm2,    [ecx+edx]
+    movq    mm3,    [eax+ebx+8]
+    movq    mm4,    [ecx+edx+8]
+
+    psadbw  mm1,    mm2
+    psadbw  mm3,    mm4
+    paddw   mm0,    mm1
+    paddw   mm0,    mm3
+
+    lea     eax,    [eax+2*ebx] ; advance the first pointer by two rows
+    lea     ecx,    [ecx+2*edx] ; advance the second pointer by two rows
+%endmacro
+
+%macro SAD_INC_2x8P 0           ; mm0 += SAD of two 8-byte rows at [eax] vs [ecx]; then both pointers step two rows
+    movq    mm1,    [eax]       ; row 0 of the first block
+    movq    mm2,    [ecx]       ; row 0 of the second block
+    movq    mm3,    [eax+ebx]   ; row 1 (ebx and edx are the two row strides)
+    movq    mm4,    [ecx+edx]
+
+    psadbw  mm1,    mm2         ; per-row sum of absolute byte differences
+    psadbw  mm3,    mm4
+    paddw   mm0,    mm1         ; accumulate both rows
+    paddw   mm0,    mm3
+
+    lea     eax,    [eax+2*ebx] ; advance the first pointer by two rows
+    lea     ecx,    [ecx+2*edx] ; advance the second pointer by two rows
+%endmacro
+
+%macro SAD_INC_2x4P 0           ; mm0 += SAD of two 4-byte rows at [eax] vs [ecx]; then both pointers step two rows
+    movd    mm1,    [eax]       ; movd loads 4 bytes and clears the upper half, so psadbw only sees these 4 pixels
+    movd    mm2,    [ecx]
+    movd    mm3,    [eax+ebx]   ; row 1 (ebx and edx are the two row strides)
+    movd    mm4,    [ecx+edx]
+
+    psadbw  mm1,    mm2         ; per-row sum of absolute byte differences
+    psadbw  mm3,    mm4
+    paddw   mm0,    mm1         ; accumulate both rows
+    paddw   mm0,    mm3
+
+    lea     eax,    [eax+2*ebx] ; advance the first pointer by two rows
+    lea     ecx,    [ecx+2*edx] ; advance the second pointer by two rows
+%endmacro
+
+%macro LOAD_DIFF_4P 5  ; MMP, MMT, MMZ, [pix1], [pix2]: MMP = four 16-bit differences pix1 - pix2; MMT is scratch
+    movd        %1, %4          ; load 4 bytes from the first operand
+    punpcklbw   %1, %3          ; widen to words; the high byte of each word comes from MMZ
+    movd        %2, %5          ; load 4 bytes from the second operand
+    punpcklbw   %2, %3          ; same MMZ high bytes as above...
+    psubw       %1, %2          ; ...so they cancel in the wrapping word subtraction, whatever MMZ holds
+%endmacro
+
+%macro LOAD_DIFF_INC_4x4 11 ; p1,p2,p3,p4, t, z, pix1, i_pix1, pix2, i_pix2, offset: load a 4x4 block of differences
+    LOAD_DIFF_4P %1, %5, %6, [%7+%11],    [%9+%11]          ; row 0 -> p1 (offset is a byte displacement within the row)
+    LOAD_DIFF_4P %2, %5, %6, [%7+%8+%11], [%9+%10+%11]      ; row 1 -> p2
+    lea %7, [%7+2*%8]           ; advance pix1 by two strides
+    lea %9, [%9+2*%10]          ; advance pix2 by two strides
+    LOAD_DIFF_4P %3, %5, %6, [%7+%11],    [%9+%11]          ; row 2 -> p3
+    LOAD_DIFF_4P %4, %5, %6, [%7+%8+%11], [%9+%10+%11]      ; row 3 -> p4
+    lea %7, [%7+2*%8]           ; on exit both pointer registers have moved down four rows in total
+    lea %9, [%9+2*%10]
+%endmacro
+
+%macro HADAMARD4_SUB_BADC 4     ; args a,b,c,d: two in-place sum/difference butterflies on the pairs (a,b) and (c,d)
+    paddw %1,   %2              ; a = a + b
+    paddw %3,   %4              ; c = c + d
+    paddw %2,   %2              ; b = 2*b
+    paddw %4,   %4              ; d = 2*d
+    psubw %2,   %1              ; b = 2*b - (a + b) = b - a   (in terms of the input values)
+    psubw %4,   %3              ; d = 2*d - (c + d) = d - c
+%endmacro
+
+%macro HADAMARD4x4 4            ; two butterfly passes over four registers, each register being one row of four words
+    HADAMARD4_SUB_BADC %1, %2, %3, %4   ; pass 1: pairs (1,2) and (3,4)
+    HADAMARD4_SUB_BADC %1, %3, %2, %4   ; pass 2: pairs (1,3) and (2,4)
+%endmacro
+
+%macro SBUTTERFLYwd 3           ; args a,b,t: a = low words of a,b interleaved; t = high words of a,b interleaved
+    movq        %3, %1          ; keep a copy of a for the high half
+    punpcklwd   %1, %2
+    punpckhwd   %3, %2
+%endmacro
+
+%macro SBUTTERFLYdq 3           ; args a,b,t: a = low dwords of a,b paired; t = high dwords of a,b paired
+    movq        %3, %1          ; keep a copy of a for the high half
+    punpckldq   %1, %2
+    punpckhdq   %3, %2
+%endmacro
+
+%macro TRANSPOSE4x4 5   ; abcd-t -> adtc: rows in args 1-4, scratch in arg 5; result rows land in args 1, 4, 5, 3
+    SBUTTERFLYwd %1, %2, %5     ; interleave words of rows a and b (high half goes to t)
+    SBUTTERFLYwd %3, %4, %2     ; interleave words of rows c and d (b is reused as scratch from here on)
+    SBUTTERFLYdq %1, %3, %4     ; pair dwords of the low halves: output rows 0 (arg 1) and 1 (arg 4)
+    SBUTTERFLYdq %5, %2, %3     ; pair dwords of the high halves: output rows 2 (arg 5) and 3 (arg 3)
+%endmacro
+
+%macro MMX_ABS 2        ; mma, mmt: mma = per-word absolute value of mma; mmt is overwritten
+    pxor    %2, %2              ; mmt = 0
+    psubw   %2, %1              ; mmt = -mma
+    pmaxsw  %1, %2              ; signed max(mma, -mma)
+%endmacro
+
+%macro MMX_ABS_SUM 3    ; mma, mmt, mms: mms += |mma| per word, saturating; mma and mmt are overwritten
+    pxor    %2, %2              ; mmt = 0
+    psubw   %2, %1              ; mmt = -mma
+    pmaxsw  %1, %2              ; mma = signed max(mma, -mma)
+    paddusw %3, %1              ; unsigned saturating add into the accumulator
+%endmacro
+
+
+%macro MMX_SUM_MM 2     ; mmv, mmt: eax = (saturating sum of the four words of mmv, kept to 16 bits) >> 1; both registers are overwritten
+    movq    %2, %1
+    psrlq   %1, 32              ; bring words 2,3 down onto words 0,1
+    paddusw %1, %2              ; word0 = w0+w2, word1 = w1+w3
+    movq    %2, %1
+    psrlq   %1, 16              ; bring word 1 down onto word 0
+    paddusw %1, %2              ; word0 = total of all four words
+    movd    eax,%1
+    and     eax,0xffff          ; keep only word 0
+    shr     eax,1               ; halve the total
+%endmacro
+
+%macro HADAMARD4x4_FIRST 0      ; in: difference rows in mm0-mm3; out: mm0 = per-word sums of absolute transformed values (starts the accumulator)
+    HADAMARD4x4 mm0, mm1, mm2, mm3          ; butterfly passes across the four rows
+    TRANSPOSE4x4 mm0, mm1, mm2, mm3, mm4    ; transposed rows are now in mm0, mm3, mm4, mm2
+    HADAMARD4x4 mm0, mm3, mm4, mm2          ; butterfly passes in the other direction
+    MMX_ABS     mm0, mm7                    ; mm0 becomes the accumulator; mm7 is scratch here and is no longer zero afterwards
+    MMX_ABS_SUM mm3, mm7, mm0
+    MMX_ABS_SUM mm4, mm7, mm0
+    MMX_ABS_SUM mm2, mm7, mm0
+%endmacro
+
+%macro HADAMARD4x4_NEXT 0       ; in: difference rows in mm1-mm4; adds their absolute transformed values into the running accumulator mm0
+    HADAMARD4x4 mm1, mm2, mm3, mm4          ; butterfly passes across the four rows
+    TRANSPOSE4x4 mm1, mm2, mm3, mm4, mm5    ; transposed rows are now in mm1, mm4, mm5, mm3
+    HADAMARD4x4 mm1, mm4, mm5, mm3          ; butterfly passes in the other direction
+    MMX_ABS_SUM mm1, mm7, mm0               ; mm7 is scratch; mm0 accumulates with unsigned saturation
+    MMX_ABS_SUM mm4, mm7, mm0
+    MMX_ABS_SUM mm5, mm7, mm0
+    MMX_ABS_SUM mm3, mm7, mm0
+%endmacro
+
+;=============================================================================
+; Code
+;=============================================================================
+
+SECTION .text
+
+cglobal x264_pixel_sad_16x16_mmxext     ; exported SAD routines, one per block size
+cglobal x264_pixel_sad_16x8_mmxext
+cglobal x264_pixel_sad_8x16_mmxext
+cglobal x264_pixel_sad_8x8_mmxext
+cglobal x264_pixel_sad_8x4_mmxext
+cglobal x264_pixel_sad_4x8_mmxext
+cglobal x264_pixel_sad_4x4_mmxext
+
+cglobal x264_pixel_satd_4x4_mmxext      ; exported SATD routines, one per block size
+cglobal x264_pixel_satd_4x8_mmxext
+cglobal x264_pixel_satd_8x4_mmxext
+cglobal x264_pixel_satd_8x8_mmxext
+cglobal x264_pixel_satd_16x8_mmxext
+cglobal x264_pixel_satd_8x16_mmxext
+cglobal x264_pixel_satd_16x16_mmxext
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+;   int __cdecl x264_pixel_sad_16x16_mmxext (uint8_t *, int, uint8_t *, int )
+;-----------------------------------------------------------------------------
+x264_pixel_sad_16x16_mmxext:    ; returns in eax the sum of absolute differences over 16 bytes x 16 rows
+    push    ebx                 ; one push: the first stack argument sits at [esp+8]
+
+    mov     eax,    [esp+ 8]    ; pix1
+    mov     ebx,    [esp+12]    ; stride1
+    mov     ecx,    [esp+16]    ; pix2
+    mov     edx,    [esp+20]    ; stride2
+
+    pxor    mm0,    mm0         ; clear the SAD accumulator
+
+    SAD_INC_2x16P               ; each expansion adds two rows: 8 expansions = 16 rows
+    SAD_INC_2x16P
+    SAD_INC_2x16P
+    SAD_INC_2x16P
+
+    SAD_INC_2x16P
+    SAD_INC_2x16P
+    SAD_INC_2x16P
+    SAD_INC_2x16P
+
+    movd eax,    mm0            ; return the low 32 bits of the accumulator
+
+    pop ebx
+    ret
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+;   int __cdecl x264_pixel_sad_16x8_mmxext (uint8_t *, int, uint8_t *, int )
+;-----------------------------------------------------------------------------
+x264_pixel_sad_16x8_mmxext:     ; returns in eax the sum of absolute differences over 16 bytes x 8 rows
+    push    ebx                 ; one push: the first stack argument sits at [esp+8]
+
+    mov     eax,    [esp+ 8]    ; pix1
+    mov     ebx,    [esp+12]    ; stride1
+    mov     ecx,    [esp+16]    ; pix2
+    mov     edx,    [esp+20]    ; stride2
+
+    pxor    mm0,    mm0         ; clear the SAD accumulator
+
+    SAD_INC_2x16P               ; each expansion adds two rows: 4 expansions = 8 rows
+    SAD_INC_2x16P
+    SAD_INC_2x16P
+    SAD_INC_2x16P
+
+    movd eax,    mm0            ; return the low 32 bits of the accumulator
+
+    pop ebx
+    ret
+
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+;   int __cdecl x264_pixel_sad_8x16_mmxext (uint8_t *, int, uint8_t *, int )
+;-----------------------------------------------------------------------------
+x264_pixel_sad_8x16_mmxext:     ; returns in eax the sum of absolute differences over 8 bytes x 16 rows
+    push    ebx                 ; one push: the first stack argument sits at [esp+8]
+
+    mov     eax,    [esp+ 8]    ; pix1
+    mov     ebx,    [esp+12]    ; stride1
+    mov     ecx,    [esp+16]    ; pix2
+    mov     edx,    [esp+20]    ; stride2
+
+    pxor    mm0,    mm0         ; clear the SAD accumulator
+
+    SAD_INC_2x8P                ; each expansion adds two rows: 8 expansions = 16 rows
+    SAD_INC_2x8P
+    SAD_INC_2x8P
+    SAD_INC_2x8P
+
+    SAD_INC_2x8P
+    SAD_INC_2x8P
+    SAD_INC_2x8P
+    SAD_INC_2x8P
+
+    movd eax,    mm0            ; return the low 32 bits of the accumulator
+
+    pop ebx
+    ret
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+;   int __cdecl x264_pixel_sad_8x8_mmxext (uint8_t *, int, uint8_t *, int )
+;-----------------------------------------------------------------------------
+x264_pixel_sad_8x8_mmxext:      ; returns in eax the sum of absolute differences over 8 bytes x 8 rows
+    push    ebx                 ; one push: the first stack argument sits at [esp+8]
+
+    mov     eax,    [esp+ 8]    ; pix1
+    mov     ebx,    [esp+12]    ; stride1
+    mov     ecx,    [esp+16]    ; pix2
+    mov     edx,    [esp+20]    ; stride2
+
+    pxor    mm0,    mm0         ; clear the SAD accumulator
+
+    SAD_INC_2x8P                ; each expansion adds two rows: 4 expansions = 8 rows
+    SAD_INC_2x8P
+    SAD_INC_2x8P
+    SAD_INC_2x8P
+
+    movd eax,    mm0            ; return the low 32 bits of the accumulator
+
+    pop ebx
+    ret
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+;   int __cdecl x264_pixel_sad_8x4_mmxext (uint8_t *, int, uint8_t *, int )
+;-----------------------------------------------------------------------------
+x264_pixel_sad_8x4_mmxext:      ; returns in eax the sum of absolute differences over 8 bytes x 4 rows
+    push    ebx                 ; one push: the first stack argument sits at [esp+8]
+
+    mov     eax,    [esp+ 8]    ; pix1
+    mov     ebx,    [esp+12]    ; stride1
+    mov     ecx,    [esp+16]    ; pix2
+    mov     edx,    [esp+20]    ; stride2
+
+    pxor    mm0,    mm0         ; clear the SAD accumulator
+
+    SAD_INC_2x8P                ; each expansion adds two rows: 2 expansions = 4 rows
+    SAD_INC_2x8P
+
+    movd eax,    mm0            ; return the low 32 bits of the accumulator
+
+    pop ebx
+    ret
+
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+;   int __cdecl x264_pixel_sad_4x8_mmxext (uint8_t *, int, uint8_t *, int )
+;-----------------------------------------------------------------------------
+x264_pixel_sad_4x8_mmxext:      ; returns in eax the sum of absolute differences over 4 bytes x 8 rows
+    push    ebx                 ; one push: the first stack argument sits at [esp+8]
+
+    mov     eax,    [esp+ 8]    ; pix1
+    mov     ebx,    [esp+12]    ; stride1
+    mov     ecx,    [esp+16]    ; pix2
+    mov     edx,    [esp+20]    ; stride2
+
+    pxor    mm0,    mm0         ; clear the SAD accumulator
+
+    SAD_INC_2x4P                ; each expansion adds two rows: 4 expansions = 8 rows
+    SAD_INC_2x4P
+
+    SAD_INC_2x4P
+    SAD_INC_2x4P
+
+    movd eax,    mm0            ; return the low 32 bits of the accumulator
+
+    pop ebx
+    ret
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+;   int __cdecl x264_pixel_sad_4x4_mmxext (uint8_t *, int, uint8_t *, int )
+;-----------------------------------------------------------------------------
+x264_pixel_sad_4x4_mmxext:      ; returns in eax the sum of absolute differences over 4 bytes x 4 rows
+    push    ebx                 ; one push: the first stack argument sits at [esp+8]
+
+    mov     eax,    [esp+ 8]    ; pix1
+    mov     ebx,    [esp+12]    ; stride1
+    mov     ecx,    [esp+16]    ; pix2
+    mov     edx,    [esp+20]    ; stride2
+
+    pxor    mm0,    mm0         ; clear the SAD accumulator
+
+    SAD_INC_2x4P                ; each expansion adds two rows: 2 expansions = 4 rows
+    SAD_INC_2x4P
+
+    movd eax,    mm0            ; return the low 32 bits of the accumulator
+
+    pop ebx
+    ret
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+;   int __cdecl x264_pixel_satd_4x4_mmxext (uint8_t *, int, uint8_t *, int )
+;-----------------------------------------------------------------------------
+x264_pixel_satd_4x4_mmxext:     ; transforms the 4x4 block of pix1 - pix2 differences and returns half the sum of absolute values in eax
+    push    ebx                 ; one push: the first stack argument sits at [esp+8]
+
+    mov     eax,    [esp+ 8]    ; pix1
+    mov     ebx,    [esp+12]    ; stride1
+    mov     ecx,    [esp+16]    ; pix2
+    mov     edx,    [esp+20]    ; stride2
+
+    pxor    mm7,    mm7         ; mm7 is passed as the MMZ operand of the loads below
+
+
+    LOAD_DIFF_4P mm0, mm6, mm7, [eax],       [ecx]          ; row 0 differences
+    LOAD_DIFF_4P mm1, mm6, mm7, [eax+ebx],   [ecx+edx]      ; row 1
+    LOAD_DIFF_4P mm2, mm6, mm7, [eax+2*ebx], [ecx+2*edx]    ; row 2
+    add eax, ebx                ; step one row so that base+2*stride addresses row 3
+    add ecx, edx
+    LOAD_DIFF_4P mm3, mm6, mm7, [eax+2*ebx], [ecx+2*edx]    ; row 3
+
+    HADAMARD4x4_FIRST           ; mm0 = per-word sums of the absolute transformed differences
+
+    MMX_SUM_MM  mm0, mm7        ; eax = (sum of the four words, kept to 16 bits) >> 1
+    pop     ebx
+    ret
+
+
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+;   int __cdecl x264_pixel_satd_4x8_mmxext (uint8_t *, int, uint8_t *, int )
+;-----------------------------------------------------------------------------
+x264_pixel_satd_4x8_mmxext:     ; SATD over 4 bytes x 8 rows, computed as two stacked 4x4 blocks; result in eax
+    push    ebx                 ; one push: the first stack argument sits at [esp+8]
+
+    mov     eax,    [esp+ 8]    ; pix1
+    mov     ebx,    [esp+12]    ; stride1
+    mov     ecx,    [esp+16]    ; pix2
+    mov     edx,    [esp+20]    ; stride2
+
+    pxor    mm7,    mm7         ; mm7 is passed as the MMZ operand of the loads below
+
+    LOAD_DIFF_INC_4x4 mm0, mm1, mm2, mm3, mm6, mm7, eax, ebx, ecx, edx, 0   ; rows 0-3; pointers advance four rows
+    HADAMARD4x4_FIRST           ; starts the accumulator in mm0
+
+    LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 0   ; rows 4-7
+    HADAMARD4x4_NEXT            ; adds this block into mm0
+
+    MMX_SUM_MM  mm0, mm7        ; eax = (sum of the four words, kept to 16 bits) >> 1
+    pop     ebx
+    ret
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+;   int __cdecl x264_pixel_satd_8x4_mmxext (uint8_t *, int, uint8_t *, int )
+;-----------------------------------------------------------------------------
+x264_pixel_satd_8x4_mmxext:     ; SATD over 8 bytes x 4 rows, computed as two side-by-side 4x4 blocks; result in eax
+    push    ebx                 ; one push: the first stack argument sits at [esp+8]
+
+    mov     eax,    [esp+ 8]    ; pix1
+    mov     ebx,    [esp+12]    ; stride1
+    mov     ecx,    [esp+16]    ; pix2
+    mov     edx,    [esp+20]    ; stride2
+
+    pxor    mm7,    mm7         ; mm7 is passed as the MMZ operand of the loads below
+
+    LOAD_DIFF_INC_4x4 mm0, mm1, mm2, mm3, mm6, mm7, eax, ebx, ecx, edx, 0   ; bytes 0-3 of rows 0-3
+    HADAMARD4x4_FIRST           ; starts the accumulator in mm0
+
+    mov     eax,    [esp+ 8]    ; pix1 (reload: the macro advanced the pointers)
+    mov     ecx,    [esp+16]    ; pix2
+
+    LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 4   ; bytes 4-7 of rows 0-3
+    HADAMARD4x4_NEXT            ; adds this block into mm0
+
+    MMX_SUM_MM  mm0, mm7        ; eax = (sum of the four words, kept to 16 bits) >> 1
+    pop     ebx
+    ret
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+;   int __cdecl x264_pixel_satd_8x8_mmxext (uint8_t *, int, uint8_t *, int )
+;-----------------------------------------------------------------------------
+x264_pixel_satd_8x8_mmxext:     ; SATD over 8 bytes x 8 rows, computed as four 4x4 blocks; result in eax
+    push    ebx                 ; one push: the first stack argument sits at [esp+8]
+
+    mov     eax,    [esp+ 8]    ; pix1
+    mov     ebx,    [esp+12]    ; stride1
+    mov     ecx,    [esp+16]    ; pix2
+    mov     edx,    [esp+20]    ; stride2
+
+    pxor    mm7,    mm7         ; mm7 is passed as the MMZ operand of the loads below
+
+    LOAD_DIFF_INC_4x4 mm0, mm1, mm2, mm3, mm6, mm7, eax, ebx, ecx, edx, 0   ; bytes 0-3, rows 0-3
+    HADAMARD4x4_FIRST           ; starts the accumulator in mm0
+
+    LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 0   ; bytes 0-3, rows 4-7
+    HADAMARD4x4_NEXT
+
+    mov     eax,    [esp+ 8]    ; pix1 (back to the top row for the next column)
+    mov     ecx,    [esp+16]    ; pix2
+
+    LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 4   ; bytes 4-7, rows 0-3
+    HADAMARD4x4_NEXT
+
+    LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 4   ; bytes 4-7, rows 4-7
+    HADAMARD4x4_NEXT
+
+    MMX_SUM_MM  mm0, mm7        ; eax = (sum of the four words, kept to 16 bits) >> 1
+    pop     ebx
+    ret
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+;   int __cdecl x264_pixel_satd_16x8_mmxext (uint8_t *, int, uint8_t *, int )
+;-----------------------------------------------------------------------------
+x264_pixel_satd_16x8_mmxext:    ; SATD over 16 bytes x 8 rows: two 8-byte-wide halves are reduced separately and added; result in eax
+    push    ebx
+    push    ebp                 ; two pushes: the first stack argument sits at [esp+12]
+
+    mov     eax,    [esp+12]    ; pix1
+    mov     ebx,    [esp+16]    ; stride1
+    mov     ecx,    [esp+20]    ; pix2
+    mov     edx,    [esp+24]    ; stride2
+
+    pxor    mm7,    mm7         ; mm7 is passed as the MMZ operand of the loads below
+    xor     ebp,    ebp         ; ebp is overwritten by the mov below before it is ever read
+
+    LOAD_DIFF_INC_4x4 mm0, mm1, mm2, mm3, mm6, mm7, eax, ebx, ecx, edx, 0   ; bytes 0-3, rows 0-3
+    HADAMARD4x4_FIRST           ; starts the accumulator in mm0
+
+    LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 0   ; bytes 0-3, rows 4-7
+    HADAMARD4x4_NEXT
+
+    mov     eax,    [esp+12]    ; pix1 (back to the top row)
+    mov     ecx,    [esp+20]    ; pix2
+
+    LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 4   ; bytes 4-7, rows 0-3
+    HADAMARD4x4_NEXT
+
+    LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 4   ; bytes 4-7, rows 4-7
+    HADAMARD4x4_NEXT
+
+    MMX_SUM_MM  mm0, mm7        ; eax = reduced total of the left 8-byte half
+    mov     ebp, eax            ; bank it while eax is reused as the pix1 pointer
+
+    mov     eax,    [esp+12]    ; pix1
+    mov     ecx,    [esp+20]    ; pix2
+
+    LOAD_DIFF_INC_4x4 mm0, mm1, mm2, mm3, mm6, mm7, eax, ebx, ecx, edx, 8   ; bytes 8-11, rows 0-3
+    HADAMARD4x4_FIRST           ; restarts the accumulator in mm0
+
+    LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 8   ; bytes 8-11, rows 4-7
+    HADAMARD4x4_NEXT
+
+    mov     eax,    [esp+12]    ; pix1
+    mov     ecx,    [esp+20]    ; pix2
+
+    LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 12  ; bytes 12-15, rows 0-3
+    HADAMARD4x4_NEXT
+
+    LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 12  ; bytes 12-15, rows 4-7
+    HADAMARD4x4_NEXT
+
+    MMX_SUM_MM  mm0, mm7        ; eax = reduced total of the right 8-byte half
+    add         eax, ebp        ; return left half + right half
+
+    pop     ebp
+    pop     ebx
+    ret
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+;   int __cdecl x264_pixel_satd_8x16_mmxext (uint8_t *, int, uint8_t *, int )
+;-----------------------------------------------------------------------------
+x264_pixel_satd_8x16_mmxext:    ; SATD over 8 bytes x 16 rows: two 4-byte-wide columns are reduced separately and added; result in eax
+    push    ebx
+    push    ebp                 ; two pushes: the first stack argument sits at [esp+12]
+
+    mov     eax,    [esp+12]    ; pix1
+    mov     ebx,    [esp+16]    ; stride1
+    mov     ecx,    [esp+20]    ; pix2
+    mov     edx,    [esp+24]    ; stride2
+
+    pxor    mm7,    mm7         ; mm7 is passed as the MMZ operand of the loads below
+    xor     ebp,    ebp         ; ebp is overwritten by the mov below before it is ever read
+
+    LOAD_DIFF_INC_4x4 mm0, mm1, mm2, mm3, mm6, mm7, eax, ebx, ecx, edx, 0   ; bytes 0-3, rows 0-3
+    HADAMARD4x4_FIRST           ; starts the accumulator in mm0
+
+    LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 0   ; rows 4-7
+    HADAMARD4x4_NEXT
+
+    LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 0   ; rows 8-11
+    HADAMARD4x4_NEXT
+
+    LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 0   ; rows 12-15
+    HADAMARD4x4_NEXT
+
+    MMX_SUM_MM  mm0, mm7        ; eax = reduced total of the left column
+    mov     ebp, eax            ; bank it while eax is reused as the pix1 pointer
+
+    mov     eax,    [esp+12]    ; pix1 (back to the top row)
+    mov     ecx,    [esp+20]    ; pix2
+
+    LOAD_DIFF_INC_4x4 mm0, mm1, mm2, mm3, mm6, mm7, eax, ebx, ecx, edx, 4   ; bytes 4-7, rows 0-3
+    HADAMARD4x4_FIRST           ; restarts the accumulator in mm0
+
+    LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 4   ; rows 4-7
+    HADAMARD4x4_NEXT
+
+    LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 4   ; rows 8-11
+    HADAMARD4x4_NEXT
+
+    LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 4   ; rows 12-15
+    HADAMARD4x4_NEXT
+
+    MMX_SUM_MM  mm0, mm7        ; eax = reduced total of the right column
+    add     eax,    ebp         ; return left column + right column
+
+    pop     ebp
+    pop     ebx
+    ret
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+;   int __cdecl x264_pixel_satd_16x16_mmxext (uint8_t *, int, uint8_t *, int )
+;-----------------------------------------------------------------------------
+x264_pixel_satd_16x16_mmxext:   ; SATD over 16 bytes x 16 rows: four 4-byte-wide columns are reduced separately and added; result in eax
+    push    ebx
+    push    ebp                 ; two pushes: the first stack argument sits at [esp+12]
+
+    mov     eax,    [esp+12]    ; pix1
+    mov     ebx,    [esp+16]    ; stride1
+    mov     ecx,    [esp+20]    ; pix2
+    mov     edx,    [esp+24]    ; stride2
+
+    pxor    mm7,    mm7         ; mm7 is passed as the MMZ operand of the loads below
+    xor     ebp,    ebp         ; ebp is overwritten by the mov below before it is ever read
+
+    LOAD_DIFF_INC_4x4 mm0, mm1, mm2, mm3, mm6, mm7, eax, ebx, ecx, edx, 0   ; column 0 (bytes 0-3), rows 0-3
+    HADAMARD4x4_FIRST           ; starts the accumulator in mm0
+
+    LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 0   ; rows 4-7
+    HADAMARD4x4_NEXT
+
+    LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 0   ; rows 8-11
+    HADAMARD4x4_NEXT
+
+    LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 0   ; rows 12-15
+    HADAMARD4x4_NEXT
+
+    MMX_SUM_MM  mm0, mm7        ; eax = reduced total of column 0
+    mov     ebp, eax            ; ebp becomes the running total across columns
+
+    mov     eax,    [esp+12]    ; pix1 (back to the top row)
+    mov     ecx,    [esp+20]    ; pix2
+
+    LOAD_DIFF_INC_4x4 mm0, mm1, mm2, mm3, mm6, mm7, eax, ebx, ecx, edx, 4   ; column 1 (bytes 4-7), rows 0-3
+    HADAMARD4x4_FIRST           ; restarts the accumulator in mm0
+
+    LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 4   ; rows 4-7
+    HADAMARD4x4_NEXT
+
+    LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 4   ; rows 8-11
+    HADAMARD4x4_NEXT
+
+    LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 4   ; rows 12-15
+    HADAMARD4x4_NEXT
+
+    MMX_SUM_MM  mm0, mm7        ; eax = reduced total of column 1
+    add     ebp,    eax
+
+    mov     eax,    [esp+12]    ; pix1
+    mov     ecx,    [esp+20]    ; pix2
+
+    LOAD_DIFF_INC_4x4 mm0, mm1, mm2, mm3, mm6, mm7, eax, ebx, ecx, edx, 8   ; column 2 (bytes 8-11), rows 0-3
+    HADAMARD4x4_FIRST           ; restarts the accumulator in mm0
+
+    LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 8   ; rows 4-7
+    HADAMARD4x4_NEXT
+
+    LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 8   ; rows 8-11
+    HADAMARD4x4_NEXT
+
+    LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 8   ; rows 12-15
+    HADAMARD4x4_NEXT
+
+    MMX_SUM_MM  mm0, mm7        ; eax = reduced total of column 2
+    add     ebp,    eax
+
+    mov     eax,    [esp+12]    ; pix1
+    mov     ecx,    [esp+20]    ; pix2
+
+    LOAD_DIFF_INC_4x4 mm0, mm1, mm2, mm3, mm6, mm7, eax, ebx, ecx, edx, 12  ; column 3 (bytes 12-15), rows 0-3
+    HADAMARD4x4_FIRST           ; restarts the accumulator in mm0
+
+    LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 12  ; rows 4-7
+    HADAMARD4x4_NEXT
+
+    LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 12  ; rows 8-11
+    HADAMARD4x4_NEXT
+
+    LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 12  ; rows 12-15
+    HADAMARD4x4_NEXT
+
+    MMX_SUM_MM  mm0, mm7        ; eax = reduced total of column 3
+    add     eax,    ebp         ; return the sum of all four column totals
+
+    pop     ebp
+    pop     ebx
+    ret
+
diff --git a/core/i386/pixel.h b/core/i386/pixel.h
new file mode 100644
index 00000000..799cbfde
--- /dev/null
+++ b/core/i386/pixel.h
@@ -0,0 +1,43 @@
+/*****************************************************************************
+ * pixel.h: h264 encoder library
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: pixel.h,v 1.1 2004/06/03 19:27:07 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#ifndef _I386_PIXEL_H
+#define _I386_PIXEL_H 1
+/* SAD (sum of absolute differences) of a WxH block; pix1/pix2 point at the two blocks, i_stride1/i_stride2 are their row strides in bytes. */
+int x264_pixel_sad_16x16_mmxext( uint8_t *pix1, int i_stride1, uint8_t *pix2, int i_stride2 );
+int x264_pixel_sad_16x8_mmxext( uint8_t *pix1, int i_stride1, uint8_t *pix2, int i_stride2 );
+int x264_pixel_sad_8x16_mmxext( uint8_t *pix1, int i_stride1, uint8_t *pix2, int i_stride2 );
+int x264_pixel_sad_8x8_mmxext( uint8_t *pix1, int i_stride1, uint8_t *pix2, int i_stride2 );
+int x264_pixel_sad_8x4_mmxext( uint8_t *pix1, int i_stride1, uint8_t *pix2, int i_stride2 );
+int x264_pixel_sad_4x8_mmxext( uint8_t *pix1, int i_stride1, uint8_t *pix2, int i_stride2 );
+int x264_pixel_sad_4x4_mmxext( uint8_t *pix1, int i_stride1, uint8_t *pix2, int i_stride2 );
+/* SATD counterparts, same arguments: the pix1 - pix2 differences are transformed per 4x4 block before their absolute values are summed. */
+int x264_pixel_satd_16x16_mmxext( uint8_t *pix1, int i_stride1, uint8_t *pix2, int i_stride2 );
+int x264_pixel_satd_16x8_mmxext( uint8_t *pix1, int i_stride1, uint8_t *pix2, int i_stride2 );
+int x264_pixel_satd_8x16_mmxext( uint8_t *pix1, int i_stride1, uint8_t *pix2, int i_stride2 );
+int x264_pixel_satd_8x8_mmxext( uint8_t *pix1, int i_stride1, uint8_t *pix2, int i_stride2 );
+int x264_pixel_satd_8x4_mmxext( uint8_t *pix1, int i_stride1, uint8_t *pix2, int i_stride2 );
+int x264_pixel_satd_4x8_mmxext( uint8_t *pix1, int i_stride1, uint8_t *pix2, int i_stride2 );
+int x264_pixel_satd_4x4_mmxext( uint8_t *pix1, int i_stride1, uint8_t *pix2, int i_stride2 );
+
+#endif
diff --git a/core/i386/predict.c b/core/i386/predict.c
new file mode 100644
index 00000000..587416bd
--- /dev/null
+++ b/core/i386/predict.c
@@ -0,0 +1,429 @@
+/*****************************************************************************
+ * predict.c: h264 encoder
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: predict.c,v 1.1 2004/06/03 19:27:07 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+/* XXX predict4x4 are inspired from ffmpeg h264 decoder
+ */
+
+#include <stdlib.h>
+#include <stdint.h>
+
+#include "x264.h"   /* for keyword inline */
+#include "../predict.h"
+#include "predict.h"
+
+static inline int clip_uint8( int a )   /* clamp a to the 0..255 range */
+{
+    if( a & ~255 )                      /* some bit outside the low 8 is set */
+        return a < 0 ? 0 : 255;         /* was (-a)>>31, which is -1 for a > 255 */
+    else
+        return a;
+}
+
+/****************************************************************************
+ * 16x16 prediction for intra block DC, H, V, P
+ ****************************************************************************/
+static void predict_16x16_dc( uint8_t *src, int i_stride )
+{
+    /* Fill the 16x16 block with the rounded mean of its 16 left and
+     * 16 top neighbours. */
+    const uint8_t *left = src - 1;
+    const uint8_t *top  = src - i_stride;
+    uint32_t sum = 0;
+    uint32_t fill;
+    int row, n;
+
+    for( n = 0; n < 16; n++ )
+        sum += left[n * i_stride] + top[n];
+
+    /* 32 samples: round, divide, then replicate the byte in all 4 lanes */
+    fill = (( sum + 16 ) >> 5) * 0x01010101;
+
+    for( row = 0; row < 16; row++, src += i_stride )
+    {
+        uint32_t *dst = (uint32_t*)src;
+        dst[0] = fill;
+        dst[1] = fill;
+        dst[2] = fill;
+        dst[3] = fill;
+    }
+}
+static void predict_16x16_dc_left( uint8_t *src, int i_stride )
+{
+    /* Fill the 16x16 block with the rounded mean of the 16 pixels in the
+     * column immediately left of it. */
+    const uint8_t *left = src - 1;
+    uint32_t sum = 0, fill;
+    int row;
+
+    for( row = 0; row < 16; row++ )
+        sum += left[row * i_stride];
+
+    /* replicate the mean byte into all four lanes of a 32-bit word */
+    fill = (( sum + 8 ) >> 4) * 0x01010101;
+
+    for( row = 0; row < 16; row++, src += i_stride )
+    {
+        uint32_t *dst = (uint32_t*)src;
+        dst[0] = fill;
+        dst[1] = fill;
+        dst[2] = fill;
+        dst[3] = fill;
+    }
+}
+static void predict_16x16_dc_top( uint8_t *src, int i_stride )
+{
+    /* Fill the 16x16 block with the rounded mean of the 16 pixels in the
+     * row immediately above it. */
+    const uint8_t *top = src - i_stride;
+    uint32_t sum = 0, fill;
+    int n;
+
+    for( n = 0; n < 16; n++ )
+        sum += top[n];
+
+    /* replicate the mean byte into all four lanes of a 32-bit word */
+    fill = (( sum + 8 ) >> 4) * 0x01010101;
+
+    for( n = 0; n < 16; n++, src += i_stride )
+    {
+        uint32_t *dst = (uint32_t*)src;
+        dst[0] = fill;
+        dst[1] = fill;
+        dst[2] = fill;
+        dst[3] = fill;
+    }
+}
+static void predict_16x16_dc_128( uint8_t *src, int i_stride )
+{
+    /* No neighbours are used: every pixel of the 16x16 block is set to
+     * 128 (0x80), written four bytes at a time. */
+    const uint32_t fill = 0x80808080;
+    int row;
+
+    for( row = 0; row < 16; row++, src += i_stride )
+    {
+        uint32_t *dst = (uint32_t*)src;
+        dst[0] = fill;
+        dst[1] = fill;
+        dst[2] = fill;
+        dst[3] = fill;
+    }
+}
+static void predict_16x16_h( uint8_t *src, int i_stride )
+{
+    /* Horizontal prediction: each of the 16 rows is filled with the pixel
+     * immediately to its left. */
+    int row;
+
+    for( row = 0; row < 16; row++, src += i_stride )
+    {
+        /* unsigned constant: 0x01010101 * 255 does not fit in a signed int */
+        const uint32_t fill = 0x01010101U * src[-1];
+        uint32_t *dst = (uint32_t*)src;
+
+        dst[0] = fill;
+        dst[1] = fill;
+        dst[2] = fill;
+        dst[3] = fill;
+    }
+}
+static void predict_16x16_v( uint8_t *src, int i_stride )
+{
+    int i;
+    /* Vertical prediction: load the 16 pixels above the block into mm0/mm1 ... */
+    asm volatile(
+        "movq  (%0), %%mm0\n"
+        "movq 8(%0), %%mm1\n" :: "r"(&src[-i_stride]) : "memory" );
+    /* ... and store them to all 16 rows; "memory" tells the compiler the asm touches *src */
+    for( i = 0; i < 16; i++ )
+    {
+        asm volatile(
+            "movq %%mm0,  (%0)\n"
+            "movq %%mm1, 8(%0)\n" :: "r"(src) : "memory" );
+        src += i_stride;  /* NOTE: mm0/mm1 are assumed to survive between the asm statements */
+    }
+}
+
+/****************************************************************************
+ * 8x8 prediction for intra chroma block DC, H, V, P
+ ****************************************************************************/
+static void predict_8x8_dc_128( uint8_t *src, int i_stride )
+{
+    /* No neighbours are used: every pixel of the 8x8 block is set to
+     * 128 (0x80), written four bytes at a time. */
+    const uint32_t fill = 0x80808080;
+    int row;
+
+    for( row = 0; row < 8; row++, src += i_stride )
+    {
+        uint32_t *dst = (uint32_t*)src;
+        dst[0] = fill;
+        dst[1] = fill;
+    }
+}
+static void predict_8x8_dc_left( uint8_t *src, int i_stride )
+{
+    /* The 8x8 block is split into an upper and a lower half; each half is
+     * filled with the rounded mean of the 4 left-neighbour pixels that sit
+     * beside it. */
+    const uint8_t *left = src - 1;
+    uint32_t sum_upper = 0;
+    uint32_t sum_lower = 0;
+    uint32_t fill_upper, fill_lower;
+    int row;
+
+    /* accumulate both groups of four left neighbours in one pass */
+    for( row = 0; row < 4; row++ )
+    {
+        sum_upper += left[row * i_stride];
+        sum_lower += left[(row+4) * i_stride];
+    }
+    /* replicate each mean byte across a 32-bit word */
+    fill_upper = (( sum_upper + 2 ) >> 2)*0x01010101;
+    fill_lower = (( sum_lower + 2 ) >> 2)*0x01010101;
+
+    /* rows 0-3 take the upper mean, rows 4-7 the lower one */
+    for( row = 0; row < 8; row++, src += i_stride )
+    {
+        const uint32_t fill = row < 4 ? fill_upper : fill_lower;
+        uint32_t *dst = (uint32_t*)src;
+
+        dst[0] = fill;
+        dst[1] = fill;
+    }
+}
+static void predict_8x8_dc_top( uint8_t *src, int i_stride )
+{
+    /* Left and right halves of the 8x8 block are each filled with the
+     * rounded mean of the 4 top-neighbour pixels directly above them. */
+    const uint8_t *top = src - i_stride;
+    uint32_t sum_l = 0, sum_r = 0, fill_l, fill_r;
+    int n;
+
+    for( n = 0; n < 4; n++ )
+    {
+        sum_l += top[n];
+        sum_r += top[n + 4];
+    }
+    fill_l = (( sum_l + 2 ) >> 2)*0x01010101;
+    fill_r = (( sum_r + 2 ) >> 2)*0x01010101;
+
+    for( n = 0; n < 8; n++, src += i_stride )
+    {
+        ((uint32_t*)src)[0] = fill_l;
+        ((uint32_t*)src)[1] = fill_r;
+    }
+}
+static void predict_8x8_dc( uint8_t *src, int i_stride )
+{
+    /* Chroma DC prediction.  The 8x8 block is handled as four 4x4
+     * quadrants, each filled with its own mean:
+     *        top0  top1
+     * left0   dc0   dc1
+     * left1   dc2   dc3
+     */
+    const uint8_t *top  = src - i_stride;
+    const uint8_t *left = src - 1;
+    uint32_t top0 = 0, top1 = 0, left0 = 0, left1 = 0;
+    uint32_t dc0, dc1, dc2, dc3;
+    int i;
+
+    /* sum each group of four neighbouring pixels */
+    for( i = 0; i < 4; i++ )
+    {
+        top0  += top[i];
+        top1  += top[i + 4];
+        left0 += left[i * i_stride];
+        left1 += left[(i+4) * i_stride];
+    }
+    /* The sums are unsigned so that multiplying a mean by 0x01010101 (to
+     * replicate the byte) cannot overflow a signed int, as it did for means
+     * >= 128.  dc0 uses top0+left0, dc1 top1, dc2 left1, dc3 top1+left1. */
+    dc0 = (( top0 + left0 + 4 ) >> 3)*0x01010101;
+    dc1 = (( top1 + 2 ) >> 2)*0x01010101;
+    dc2 = (( left1 + 2 ) >> 2)*0x01010101;
+    dc3 = (( top1 + left1 + 4 ) >> 3)*0x01010101;
+
+    /* upper quadrants */
+    for( i = 0; i < 4; i++, src += i_stride )
+    {
+        uint32_t *dst = (uint32_t*)src;
+        dst[0] = dc0;
+        dst[1] = dc1;
+    }
+
+    /* lower quadrants */
+    for( i = 0; i < 4; i++, src += i_stride )
+    {
+        uint32_t *dst = (uint32_t*)src;
+        dst[0] = dc2;
+        dst[1] = dc3;
+    }
+}
+
+static void predict_8x8_h( uint8_t *src, int i_stride )
+{
+    /* Horizontal prediction: each of the 8 rows is filled with the pixel
+     * immediately to its left. */
+    int row;
+
+    for( row = 0; row < 8; row++, src += i_stride )
+    {
+        /* unsigned constant: 0x01010101 * 255 does not fit in a signed int */
+        const uint32_t fill = 0x01010101U * src[-1];
+        uint32_t *dst = (uint32_t*)src;
+        dst[0] = fill;
+        dst[1] = fill;
+    }
+}
+static void predict_8x8_v( uint8_t *src, int i_stride )
+{
+    int i;
+    /* Vertical prediction: load the 8 pixels above the block into mm0 ... */
+    asm volatile( "movq  (%0), %%mm0\n" :: "r"(&src[-i_stride]) : "memory" );
+    /* ... and store them to all 8 rows; "memory" tells the compiler the asm touches *src */
+    for( i = 0; i < 8; i++ )
+    {
+        asm volatile( "movq %%mm0,  (%0)\n" :: "r"(src) : "memory" );
+        src += i_stride;  /* NOTE: mm0 is assumed to survive between the asm statements */
+    }
+}
+
+
+/****************************************************************************
+ * 4x4 prediction for intra luma block DC, H, V, P
+ ****************************************************************************/
+static void predict_4x4_dc_128( uint8_t *src, int i_stride )
+{
+    /* No neighbours are used: all 16 pixels of the 4x4 block become 128. */
+    int row;
+
+    for( row = 0; row < 4; row++ )
+    {
+        uint32_t *dst = (uint32_t*)&src[row * i_stride];
+        *dst = 0x80808080;
+    }
+}
+static void predict_4x4_dc_left( uint8_t *src, int i_stride )
+{
+    /* Fill the 4x4 block with the rounded mean of the 4 pixels in the
+     * column immediately left of it. */
+    const uint8_t *left = src - 1;
+    /* unsigned sum: mean * 0x01010101 overflows a signed int for means >= 128 */
+    const uint32_t sum = left[0]          + left[i_stride] +
+                         left[2*i_stride] + left[3*i_stride];
+    const uint32_t dc = (( sum + 2 ) >> 2)*0x01010101;
+    int row;
+
+    for( row = 0; row < 4; row++ )
+        *(uint32_t*)&src[row * i_stride] = dc;
+}
+static void predict_4x4_dc_top( uint8_t *src, int i_stride )
+{
+    /* Fill the 4x4 block with the rounded mean of the 4 pixels in the
+     * row immediately above it. */
+    const uint8_t *top = src - i_stride;
+    /* unsigned sum: mean * 0x01010101 overflows a signed int for means >= 128 */
+    const uint32_t sum = top[0] + top[1] + top[2] + top[3];
+    const uint32_t dc = (( sum + 2 ) >> 2)*0x01010101;
+    int row;
+
+    for( row = 0; row < 4; row++ )
+        *(uint32_t*)&src[row * i_stride] = dc;
+}
+static void predict_4x4_dc( uint8_t *src, int i_stride )
+{
+    /* Fill the 4x4 block with the rounded mean of its 4 left and 4 top
+     * neighbours. */
+    const uint8_t *left = src - 1;
+    const uint8_t *top  = src - i_stride;
+    /* unsigned sum: mean * 0x01010101 overflows a signed int for means >= 128 */
+    const uint32_t sum = left[0]          + left[i_stride] +
+                         left[2*i_stride] + left[3*i_stride] +
+                         top[0] + top[1] + top[2] + top[3];
+    const uint32_t dc = (( sum + 4 ) >> 3)*0x01010101;
+    int row;
+
+    for( row = 0; row < 4; row++ )
+        *(uint32_t*)&src[row * i_stride] = dc;
+}
+static void predict_4x4_h( uint8_t *src, int i_stride )
+{
+    /* Horizontal prediction: each of the 4 rows is filled with the pixel
+     * immediately to its left. */
+    int row;
+
+    for( row = 0; row < 4; row++, src += i_stride )
+    {
+        /* unsigned constant: 0x01010101 * 255 does not fit in a signed int */
+        *(uint32_t*)src = 0x01010101U * src[-1];
+    }
+}
+static void predict_4x4_v( uint8_t *src, int i_stride )
+{
+    /* Vertical prediction: the 4 pixels above the block, read as one
+     * 32-bit word, are copied into each of the 4 rows. */
+    const uint32_t above = *(const uint32_t*)(src - i_stride);
+    int row;
+
+    for( row = 0; row < 4; row++ )
+    {
+        uint32_t *dst = (uint32_t*)&src[row * i_stride];
+
+        *dst = above;
+    }
+}
+
+/****************************************************************************
+ * Exported functions:
+ ****************************************************************************/
+void x264_predict_16x16_init_mmxext( x264_predict_t pf[7] )
+{   /* store this file's 16x16 intra predictors into the caller's table */
+    pf[I_PRED_16x16_DC_128 ] = predict_16x16_dc_128;
+    pf[I_PRED_16x16_DC_TOP ] = predict_16x16_dc_top;
+    pf[I_PRED_16x16_DC_LEFT] = predict_16x16_dc_left;
+    pf[I_PRED_16x16_DC     ] = predict_16x16_dc;
+    pf[I_PRED_16x16_H      ] = predict_16x16_h;
+    pf[I_PRED_16x16_V      ] = predict_16x16_v;
+}
+
+void x264_predict_8x8_init_mmxext( x264_predict_t pf[7] )
+{   /* store this file's 8x8 chroma intra predictors into the caller's table */
+    pf[I_PRED_CHROMA_DC_128 ] = predict_8x8_dc_128;
+    pf[I_PRED_CHROMA_DC_TOP ] = predict_8x8_dc_top;
+    pf[I_PRED_CHROMA_DC_LEFT] = predict_8x8_dc_left;
+    pf[I_PRED_CHROMA_DC     ] = predict_8x8_dc;
+    pf[I_PRED_CHROMA_H      ] = predict_8x8_h;
+    pf[I_PRED_CHROMA_V      ] = predict_8x8_v;
+}
+
+void x264_predict_4x4_init_mmxext( x264_predict_t pf[12] )
+{   /* store this file's 4x4 intra predictors; only these six entries are written */
+    pf[I_PRED_4x4_DC_128 ] = predict_4x4_dc_128;
+    pf[I_PRED_4x4_DC_TOP ] = predict_4x4_dc_top;
+    pf[I_PRED_4x4_DC_LEFT] = predict_4x4_dc_left;
+    pf[I_PRED_4x4_DC     ] = predict_4x4_dc;
+    pf[I_PRED_4x4_H      ] = predict_4x4_h;
+    pf[I_PRED_4x4_V      ] = predict_4x4_v;
+}
+
diff --git a/core/i386/predict.h b/core/i386/predict.h
new file mode 100644
index 00000000..b00b1e59
--- /dev/null
+++ b/core/i386/predict.h
@@ -0,0 +1,31 @@
+/*****************************************************************************
+ * predict.h: h264 encoder library
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: predict.h,v 1.1 2004/06/03 19:27:07 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#ifndef _I386_PREDICT_H
+#define _I386_PREDICT_H 1
+/* Each function stores the intra predictors of core/i386/predict.c into pf[]. */
+void x264_predict_16x16_init_mmxext ( x264_predict_t pf[7] );
+void x264_predict_8x8_init_mmxext   ( x264_predict_t pf[7] );
+void x264_predict_4x4_init_mmxext   ( x264_predict_t pf[12] );
+
+#endif
diff --git a/core/macroblock.c b/core/macroblock.c
new file mode 100644
index 00000000..59603f03
--- /dev/null
+++ b/core/macroblock.c
@@ -0,0 +1,1029 @@
+/*****************************************************************************
+ * macroblock.c: h264 encoder library
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: macroblock.c,v 1.1 2004/06/03 19:27:06 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdint.h>
+
+#include "common.h"
+#include "macroblock.h"
+
+static const uint8_t block_idx_x[16] =      /* first index into block_idx_xy for each idx 0..15 */
+{
+    0, 1, 0, 1, 2, 3, 2, 3, 0, 1, 0, 1, 2, 3, 2, 3
+};
+static const uint8_t block_idx_y[16] =      /* second index into block_idx_xy for each idx 0..15 */
+{
+    0, 0, 1, 1, 0, 0, 1, 1, 2, 2, 3, 3, 2, 2, 3, 3
+};
+static const uint8_t block_idx_xy[4][4] =   /* inverse map: block_idx_xy[block_idx_x[i]][block_idx_y[i]] == i */
+{
+    { 0, 2, 8,  10},
+    { 1, 3, 9,  11},
+    { 4, 6, 12, 14},
+    { 5, 7, 13, 15}
+};
+
+static const int dequant_mf[6][4][4] =   /* dequant multipliers, read as [i_qscale%6][y][x]; DC code uses [..][0][0] */
+{
+    { {10, 13, 10, 13}, {13, 16, 13, 16}, {10, 13, 10, 13}, {13, 16, 13, 16} },
+    { {11, 14, 11, 14}, {14, 18, 14, 18}, {11, 14, 11, 14}, {14, 18, 14, 18} },
+    { {13, 16, 13, 16}, {16, 20, 16, 20}, {13, 16, 13, 16}, {16, 20, 16, 20} },
+    { {14, 18, 14, 18}, {18, 23, 18, 23}, {14, 18, 14, 18}, {18, 23, 18, 23} },
+    { {16, 20, 16, 20}, {20, 25, 20, 25}, {16, 20, 16, 20}, {20, 25, 20, 25} },
+    { {18, 23, 18, 23}, {23, 29, 23, 29}, {18, 23, 18, 23}, {23, 29, 23, 29} }
+};
+
+#if 0
+static const int i_chroma_qp_table[52] =
+{
+     0,  1,  2,  3,  4,  5,  6,  7,  8,  9,
+    10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+    20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+    29, 30, 31, 32, 32, 33, 34, 34, 35, 35,
+    36, 36, 37, 37, 37, 38, 38, 38, 39, 39,
+    39, 39
+};
+#endif
+
+int x264_mb_predict_intra4x4_mode( x264_t *h, int idx )
+{
+    /* Smaller of the modes cached at scan8 offsets -1 and -8 from block idx;
+     * a negative result falls back to I_PRED_4x4_DC. */
+    const int i_pos    = x264_scan8[idx];
+    const int i_mode_a = h->mb.cache.intra4x4_pred_mode[i_pos - 1];
+    const int i_mode_b = h->mb.cache.intra4x4_pred_mode[i_pos - 8];
+    const int i_small  = X264_MIN( i_mode_a, i_mode_b );
+
+    return i_small < 0 ? I_PRED_4x4_DC : i_small;
+}
+
+int x264_mb_predict_non_zero_code( x264_t *h, int idx )
+{
+    /* Combine the counts cached at scan8 offsets -1 and -8 from block idx:
+     * sums below 0x80 are averaged (rounding up), and the result is always
+     * masked to its low 7 bits. */
+    const int i_pos = x264_scan8[idx];
+    const int i_sum = h->mb.cache.non_zero_count[i_pos - 1]
+                    + h->mb.cache.non_zero_count[i_pos - 8];
+
+    if( i_sum >= 0x80 )
+        return i_sum & 0x7f;
+    return ( ( i_sum + 1 ) >> 1 ) & 0x7f;
+}
+
+/****************************************************************************
+ * Scan and Quant functions
+ ****************************************************************************/
+void x264_mb_dequant_2x2_dc( int16_t dct[2][2], int i_qscale )
+{
+    /* Rescale the four DC coefficients in place by dequant_mf[i_qscale%6][0][0],
+     * with a net shift of i_qscale/6 - 1 (a right shift by one when that
+     * value is negative). */
+    const int i_scale = dequant_mf[i_qscale%6][0][0];
+    const int i_shift = i_qscale/6 - 1;
+    int n;
+
+    /* negative shift: multiply, then halve */
+    if( i_shift < 0 )
+    {
+        for( n = 0; n < 4; n++ )
+            dct[n>>1][n&1] = ( dct[n>>1][n&1] * i_scale ) >> 1;
+    }
+    else
+    {
+        const int i_mul = i_scale << i_shift;
+
+        for( n = 0; n < 4; n++ )
+            dct[n>>1][n&1] = dct[n>>1][n&1] * i_mul;
+    }
+}
+
+void x264_mb_dequant_4x4_dc( int16_t dct[4][4], int i_qscale )
+{
+    const int i_qbits = i_qscale/6 - 2;
+    int x,y;
+    /* Rescale the 16 DC coefficients in place; the net shift is i_qscale/6 - 2. */
+    if( i_qbits >= 0 )
+    {
+        const int i_dmf = dequant_mf[i_qscale%6][0][0] << i_qbits;
+
+        for( y = 0; y < 4; y++ )
+        {
+            for( x = 0; x < 4; x++ )
+            {
+                dct[y][x] = dct[y][x] * i_dmf;
+            }
+        }
+    }
+    else
+    {
+        const int i_dmf = dequant_mf[i_qscale%6][0][0];
+        const int f = 1 << ( -i_qbits - 1 );    /* half of the divisor, for rounding */
+        /* was 1 << ( 1 + i_qbits ): a shift by -1 (undefined) when i_qscale < 6 */
+        for( y = 0; y < 4; y++ )
+        {
+            for( x = 0; x < 4; x++ )
+            {
+                dct[y][x] = ( dct[y][x] * i_dmf + f ) >> (-i_qbits);
+            }
+        }
+    }
+}
+
+void x264_mb_dequant_4x4( int16_t dct[4][4], int i_qscale )
+{
+    /* Multiply each coefficient by the dequant_mf entry at the same
+     * [row][column] for i_qscale%6, then shift left by i_qscale/6. */
+    const int (*scale)[4] = dequant_mf[i_qscale%6];
+    const int i_shift = i_qscale/6;
+    int row, col;
+
+    for( row = 0; row < 4; row++ )
+    {
+        for( col = 0; col < 4; col++ )
+            dct[row][col] = ( dct[row][col] * scale[row][col] ) << i_shift;
+    }
+}
+
+static inline int x264_median( int a, int b, int c )
+{
+    /* Median of three: order a and b, then see where c falls. */
+    int lo, hi;
+
+    if( b < a )
+    {
+        lo = b; hi = a;
+    }
+    else
+    {
+        lo = a; hi = b;
+    }
+    return c < lo ? lo : ( c > hi ? hi : c );
+}
+
+void x264_mb_predict_mv( x264_t *h, int i_list, int idx, int i_width, int mvp[2] )
+{
+    const int i8 = x264_scan8[idx];
+    const int i_ref= h->mb.cache.ref[i_list][i8];
+    int     i_refa = h->mb.cache.ref[i_list][i8 - 1];
+    int16_t *mv_a  = h->mb.cache.mv[i_list][i8 - 1];
+    int     i_refb = h->mb.cache.ref[i_list][i8 - 8];
+    int16_t *mv_b  = h->mb.cache.mv[i_list][i8 - 8];
+    int     i_refc = h->mb.cache.ref[i_list][i8 - 8 + i_width ];
+    int16_t *mv_c  = h->mb.cache.mv[i_list][i8 - 8 + i_width];
+    /* Writes into mvp[] the vector predicted from cache neighbours A (-1), B (-8) and C (-8+i_width). */
+    int i_count;
+    /* in these cases the entry at offset -8-1 replaces C */
+    if( (idx&0x03) == 3 || ( i_width == 2 && (idx&0x3) == 2 )|| i_refc == -2 )
+    {
+        i_refc = h->mb.cache.ref[i_list][i8 - 8 - 1];
+        mv_c   = h->mb.cache.mv[i_list][i8 - 8 - 1];
+    }
+    /* 16x8 / 8x16 partitions: return one neighbour's vector directly when its ref equals i_ref */
+    if( h->mb.i_partition == D_16x8 )
+    {
+        if( idx == 0 && i_refb == i_ref )
+        {
+            mvp[0] = mv_b[0];
+            mvp[1] = mv_b[1];
+            return;
+        }
+        else if( idx != 0 && i_refa == i_ref )
+        {
+            mvp[0] = mv_a[0];
+            mvp[1] = mv_a[1];
+            return;
+        }
+    }
+    else if( h->mb.i_partition == D_8x16 )
+    {
+        if( idx == 0 && i_refa == i_ref )
+        {
+            mvp[0] = mv_a[0];
+            mvp[1] = mv_a[1];
+            return;
+        }
+        else if( idx != 0 && i_refc == i_ref )
+        {
+            mvp[0] = mv_c[0];
+            mvp[1] = mv_c[1];
+            return;
+        }
+    }
+    /* otherwise count how many of A, B, C use the same reference as this block */
+    i_count = 0;
+    if( i_refa == i_ref ) i_count++;
+    if( i_refb == i_ref ) i_count++;
+    if( i_refc == i_ref ) i_count++;
+    /* 2-3 matches: median; 1 match: that neighbour; none: A alone if B and C have ref -2, else median */
+    if( i_count > 1 )
+    {
+        mvp[0] = x264_median( mv_a[0], mv_b[0], mv_c[0] );
+        mvp[1] = x264_median( mv_a[1], mv_b[1], mv_c[1] );
+    }
+    else if( i_count == 1 )
+    {
+        if( i_refa == i_ref )
+        {
+            mvp[0] = mv_a[0];
+            mvp[1] = mv_a[1];
+        }
+        else if( i_refb == i_ref )
+        {
+            mvp[0] = mv_b[0];
+            mvp[1] = mv_b[1];
+        }
+        else
+        {
+            mvp[0] = mv_c[0];
+            mvp[1] = mv_c[1];
+        }
+    }
+    else if( i_refb == -2 && i_refc == -2 && i_refa != -2 )
+    {
+        mvp[0] = mv_a[0];
+        mvp[1] = mv_a[1];
+    }
+    else
+    {
+        mvp[0] = x264_median( mv_a[0], mv_b[0], mv_c[0] );
+        mvp[1] = x264_median( mv_a[1], mv_b[1], mv_c[1] );
+    }
+}
+
+void x264_mb_predict_mv_16x16( x264_t *h, int i_list, int i_ref, int mvp[2] )
+{
+    int     i_refa = h->mb.cache.ref[i_list][X264_SCAN8_0 - 1];
+    int16_t *mv_a  = h->mb.cache.mv[i_list][X264_SCAN8_0 - 1];
+    int     i_refb = h->mb.cache.ref[i_list][X264_SCAN8_0 - 8];
+    int16_t *mv_b  = h->mb.cache.mv[i_list][X264_SCAN8_0 - 8];
+    int     i_refc = h->mb.cache.ref[i_list][X264_SCAN8_0 - 8 + 4];
+    int16_t *mv_c  = h->mb.cache.mv[i_list][X264_SCAN8_0 - 8 + 4];
+    /* Writes into mvp[] the vector predicted for reference i_ref from neighbours A (-1), B (-8), C (-8+4). */
+    int i_count;
+    /* when C has ref -2, the entry at offset -8-1 replaces it */
+    if( i_refc == -2 )
+    {
+        i_refc = h->mb.cache.ref[i_list][X264_SCAN8_0 - 8 - 1];
+        mv_c   = h->mb.cache.mv[i_list][X264_SCAN8_0 - 8 - 1];
+    }
+    /* count how many of A, B, C use the requested reference */
+    i_count = 0;
+    if( i_refa == i_ref ) i_count++;
+    if( i_refb == i_ref ) i_count++;
+    if( i_refc == i_ref ) i_count++;
+    /* 2-3 matches: median; 1 match: that neighbour; none: A alone if B and C have ref -2, else median */
+    if( i_count > 1 )
+    {
+        mvp[0] = x264_median( mv_a[0], mv_b[0], mv_c[0] );
+        mvp[1] = x264_median( mv_a[1], mv_b[1], mv_c[1] );
+    }
+    else if( i_count == 1 )
+    {
+        if( i_refa == i_ref )
+        {
+            mvp[0] = mv_a[0];
+            mvp[1] = mv_a[1];
+        }
+        else if( i_refb == i_ref )
+        {
+            mvp[0] = mv_b[0];
+            mvp[1] = mv_b[1];
+        }
+        else
+        {
+            mvp[0] = mv_c[0];
+            mvp[1] = mv_c[1];
+        }
+    }
+    else if( i_refb == -2 && i_refc == -2 && i_refa != -2 )
+    {
+        mvp[0] = mv_a[0];
+        mvp[1] = mv_a[1];
+    }
+    else
+    {
+        mvp[0] = x264_median( mv_a[0], mv_b[0], mv_c[0] );
+        mvp[1] = x264_median( mv_a[1], mv_b[1], mv_c[1] );
+    }
+}
+
+
+void x264_mb_predict_mv_pskip( x264_t *h, int mv[2] )
+{
+    /* The skip vector is zero when either neighbour (scan8 offsets -1 and -8)
+     * has ref -2, or has ref 0 with a zero vector; otherwise it is the
+     * 16x16 prediction for list 0, ref 0. */
+    const int     i_ref_a = h->mb.cache.ref[0][X264_SCAN8_0 - 1];
+    const int     i_ref_b = h->mb.cache.ref[0][X264_SCAN8_0 - 8];
+    const int16_t *vec_a  = h->mb.cache.mv[0][X264_SCAN8_0 - 1];
+    const int16_t *vec_b  = h->mb.cache.mv[0][X264_SCAN8_0 - 8];
+    const int b_zero_a = i_ref_a == 0 && vec_a[0] == 0 && vec_a[1] == 0;
+    const int b_zero_b = i_ref_b == 0 && vec_b[0] == 0 && vec_b[1] == 0;
+
+    mv[0] = mv[1] = 0;
+    if( i_ref_a == -2 || i_ref_b == -2 || b_zero_a || b_zero_b )
+        return;
+
+    x264_mb_predict_mv_16x16( h, 0, 0, mv );
+}
+
+static inline void x264_mb_mc_0xywh( x264_t *h, int x, int y, int width, int height )
+{
+    /* List-0 motion compensation of one partition into h->mb.pic.p_fdec.
+     * x, y, width and height are in units of 4 luma pixels (2 chroma pixels). */
+    const int i_pos = x264_scan8[0]+x+8*y;
+    const int i_ref = h->mb.cache.ref[0][i_pos];
+    const int i_mvx = h->mb.cache.mv[0][i_pos][0];
+    const int i_mvy = h->mb.cache.mv[0][i_pos][1];
+    const int i_luma_off = 4*y * h->mb.pic.i_stride[0]+4*x;
+    int i_plane;
+
+    h->mc[MC_LUMA]( &h->mb.pic.p_fref[0][i_ref][0][i_luma_off], h->mb.pic.i_stride[0],
+                    &h->mb.pic.p_fdec[0][i_luma_off],           h->mb.pic.i_stride[0],
+                    i_mvx, i_mvy, 4*width, 4*height );
+    for( i_plane = 1; i_plane < 3; i_plane++ )  /* both chroma planes */
+        h->mc[MC_CHROMA]( &h->mb.pic.p_fref[0][i_ref][i_plane][2*y*h->mb.pic.i_stride[i_plane]+2*x], h->mb.pic.i_stride[i_plane],
+                          &h->mb.pic.p_fdec[i_plane][2*y*h->mb.pic.i_stride[i_plane]+2*x],           h->mb.pic.i_stride[i_plane],
+                          i_mvx, i_mvy, 2*width, 2*height );
+}
+static inline void x264_mb_mc_1xywh( x264_t *h, int x, int y, int width, int height )
+{
+    /* List-1 motion compensation of one partition into h->mb.pic.p_fdec.
+     * x, y, width and height are in units of 4 luma pixels (2 chroma pixels). */
+    const int i_pos = x264_scan8[0]+x+8*y;
+    const int i_ref = h->mb.cache.ref[1][i_pos];
+    const int i_mvx = h->mb.cache.mv[1][i_pos][0];
+    const int i_mvy = h->mb.cache.mv[1][i_pos][1];
+    const int i_luma_off = 4*y * h->mb.pic.i_stride[0]+4*x;
+    int i_plane;
+
+    h->mc[MC_LUMA]( &h->mb.pic.p_fref[1][i_ref][0][i_luma_off], h->mb.pic.i_stride[0],
+                    &h->mb.pic.p_fdec[0][i_luma_off],           h->mb.pic.i_stride[0],
+                    i_mvx, i_mvy, 4*width, 4*height );
+    for( i_plane = 1; i_plane < 3; i_plane++ )  /* both chroma planes */
+        h->mc[MC_CHROMA]( &h->mb.pic.p_fref[1][i_ref][i_plane][2*y*h->mb.pic.i_stride[i_plane]+2*x], h->mb.pic.i_stride[i_plane],
+                          &h->mb.pic.p_fdec[i_plane][2*y*h->mb.pic.i_stride[i_plane]+2*x],           h->mb.pic.i_stride[i_plane],
+                          i_mvx, i_mvy, 2*width, 2*height );
+}
+
+static inline void x264_mb_mc_01xywh( x264_t *h, int x, int y, int width, int height )
+{
+    const int i8 = x264_scan8[0]+x+8*y;
+    /* Bi-prediction: the list-0 result goes to fdec, the list-1 result to tmp, then tmp is averaged into fdec. */
+    const int i_ref0 = h->mb.cache.ref[0][i8];
+    const int mvx0   = h->mb.cache.mv[0][i8][0];
+    const int mvy0   = h->mb.cache.mv[0][i8][1];
+
+    const int i_ref1 = h->mb.cache.ref[1][i8];
+    const int mvx1   = h->mb.cache.mv[1][i8][0];
+    const int mvy1   = h->mb.cache.mv[1][i8][1];
+    DECLARE_ALIGNED( uint8_t, tmp[16*16], 16 );
+    int     i_mode = 0;
+    /* pick the pixf.avg[] index from the luma partition size (width/height are in 4-pixel units) */
+    if( width == 4 && height == 4 ) i_mode = PIXEL_16x16;
+    else if( width == 4 && height == 2 ) i_mode = PIXEL_16x8;
+    else if( width == 2 && height == 4 ) i_mode = PIXEL_8x16;
+    else if( width == 2 && height == 2 ) i_mode = PIXEL_8x8;
+    else if( width == 2 && height == 1 ) i_mode = PIXEL_8x4;
+    else if( width == 1 && height == 2 ) i_mode = PIXEL_4x8;
+    else if( width == 1 && height == 1 ) i_mode = PIXEL_4x4;
+    /* list 0: predict luma and both chroma planes straight into fdec */
+    h->mc[MC_LUMA]( &h->mb.pic.p_fref[0][i_ref0][0][4*y * h->mb.pic.i_stride[0]+4*x], h->mb.pic.i_stride[0],
+                    &h->mb.pic.p_fdec[0][4*y *h->mb.pic.i_stride[0]+4*x],             h->mb.pic.i_stride[0],
+                    mvx0, mvy0, 4*width, 4*height );
+    h->mc[MC_CHROMA]( &h->mb.pic.p_fref[0][i_ref0][1][2*y*h->mb.pic.i_stride[1]+2*x], h->mb.pic.i_stride[1],
+                      &h->mb.pic.p_fdec[1][2*y*h->mb.pic.i_stride[1]+2*x],            h->mb.pic.i_stride[1],
+                      mvx0, mvy0, 2*width, 2*height );
+    h->mc[MC_CHROMA]( &h->mb.pic.p_fref[0][i_ref0][2][2*y*h->mb.pic.i_stride[2]+2*x], h->mb.pic.i_stride[2],
+                      &h->mb.pic.p_fdec[2][2*y*h->mb.pic.i_stride[2]+2*x],            h->mb.pic.i_stride[2],
+                      mvx0, mvy0, 2*width, 2*height );
+    /* list 1: predict each plane into tmp (stride 16) and average it into fdec */
+    /* NOTE(review): the chroma averages below reuse the luma-sized i_mode although the chroma block is 2*width x 2*height - confirm this is intended */
+    h->mc[MC_LUMA]( &h->mb.pic.p_fref[1][i_ref1][0][4*y * h->mb.pic.i_stride[0]+4*x], h->mb.pic.i_stride[0],
+                    tmp, 16, mvx1, mvy1, 4*width, 4*height );
+    h->pixf.avg[i_mode]( &h->mb.pic.p_fdec[0][4*y *h->mb.pic.i_stride[0]+4*x], h->mb.pic.i_stride[0], tmp, 16 );
+
+    h->mc[MC_CHROMA]( &h->mb.pic.p_fref[1][i_ref1][1][2*y*h->mb.pic.i_stride[1]+2*x], h->mb.pic.i_stride[1],
+                      tmp, 16, mvx1, mvy1, 2*width, 2*height );
+    h->pixf.avg[i_mode]( &h->mb.pic.p_fdec[1][2*y*h->mb.pic.i_stride[1]+2*x], h->mb.pic.i_stride[1], tmp, 16 );
+
+    h->mc[MC_CHROMA]( &h->mb.pic.p_fref[1][i_ref1][2][2*y*h->mb.pic.i_stride[2]+2*x], h->mb.pic.i_stride[2],
+                      tmp, 16, mvx1, mvy1, 2*width, 2*height );
+    h->pixf.avg[i_mode]( &h->mb.pic.p_fdec[2][2*y*h->mb.pic.i_stride[2]+2*x], h->mb.pic.i_stride[2], tmp, 16 );
+}
+
+
+void x264_mb_mc( x264_t *h )
+{   /* Run motion compensation for the current macroblock, dispatching on h->mb.i_type and its partitioning. */
+    if( h->mb.i_type == P_L0 )
+    {
+        if( h->mb.i_partition == D_16x16 )
+        {
+            x264_mb_mc_0xywh( h, 0, 0, 4, 4 );
+        }
+        else if( h->mb.i_partition == D_16x8 )
+        {
+            x264_mb_mc_0xywh( h, 0, 0, 4, 2 );
+            x264_mb_mc_0xywh( h, 0, 2, 4, 2 );
+        }
+        else if( h->mb.i_partition == D_8x16 )
+        {
+            x264_mb_mc_0xywh( h, 0, 0, 2, 4 );
+            x264_mb_mc_0xywh( h, 2, 0, 2, 4 );
+        }
+    }
+    else if( h->mb.i_type == P_8x8 )
+    {
+        int i;
+        for( i = 0; i < 4; i++ )
+        {
+            const int x = 2*(i%2);  /* origin of 8x8 block i, in 4-pixel units */
+            const int y = 2*(i/2);
+            switch( h->mb.i_sub_partition[i] )
+            {
+                case D_L0_8x8:
+                    x264_mb_mc_0xywh( h, x, y, 2, 2 );
+                    break;
+                case D_L0_8x4:
+                    x264_mb_mc_0xywh( h, x, y+0, 2, 1 );
+                    x264_mb_mc_0xywh( h, x, y+1, 2, 1 );
+                    break;
+                case D_L0_4x8:
+                    x264_mb_mc_0xywh( h, x+0, y, 1, 2 );
+                    x264_mb_mc_0xywh( h, x+1, y, 1, 2 );
+                    break;
+                case D_L0_4x4:
+                    x264_mb_mc_0xywh( h, x+0, y+0, 1, 1 );
+                    x264_mb_mc_0xywh( h, x+1, y+0, 1, 1 );
+                    x264_mb_mc_0xywh( h, x+0, y+1, 1, 1 );
+                    x264_mb_mc_0xywh( h, x+1, y+1, 1, 1 );
+                    break;
+            }
+        }
+    }
+    else if( h->mb.i_type == B_8x8 || h->mb.i_type == B_DIRECT )
+    {
+        fprintf( stderr, "mc_luma with unsupported mb\n" );  /* nothing is predicted for these types */
+        return;
+    }
+    else    /* B_*x* */
+    {
+        int b_list0[2];
+        int b_list1[2];
+
+        int i;
+
+        /* look up, for each of the two partitions, whether list 0 and/or list 1 is used */
+        for( i = 0; i < 2; i++ )
+        {
+            b_list0[i] = x264_mb_type_list0_table[h->mb.i_type][i];
+            b_list1[i] = x264_mb_type_list1_table[h->mb.i_type][i];
+        }
+        if( h->mb.i_partition == D_16x16 )
+        {
+            if( b_list0[0] && b_list1[0] ) x264_mb_mc_01xywh( h, 0, 0, 4, 4 );
+            else if( b_list0[0] )          x264_mb_mc_0xywh ( h, 0, 0, 4, 4 );
+            else if( b_list1[0] )          x264_mb_mc_1xywh ( h, 0, 0, 4, 4 );
+        }
+        else if( h->mb.i_partition == D_16x8 )
+        {
+            if( b_list0[0] && b_list1[0] ) x264_mb_mc_01xywh( h, 0, 0, 4, 2 );
+            else if( b_list0[0] )          x264_mb_mc_0xywh ( h, 0, 0, 4, 2 );
+            else if( b_list1[0] )          x264_mb_mc_1xywh ( h, 0, 0, 4, 2 );
+
+            if( b_list0[1] && b_list1[1] ) x264_mb_mc_01xywh( h, 0, 2, 4, 2 );
+            else if( b_list0[1] )          x264_mb_mc_0xywh ( h, 0, 2, 4, 2 );
+            else if( b_list1[1] )          x264_mb_mc_1xywh ( h, 0, 2, 4, 2 );
+        }
+        else if( h->mb.i_partition == D_8x16 )
+        {
+            if( b_list0[0] && b_list1[0] ) x264_mb_mc_01xywh( h, 0, 0, 2, 4 );
+            else if( b_list0[0] )          x264_mb_mc_0xywh ( h, 0, 0, 2, 4 );
+            else if( b_list1[0] )          x264_mb_mc_1xywh ( h, 0, 0, 2, 4 );
+
+            if( b_list0[1] && b_list1[1] ) x264_mb_mc_01xywh( h, 2, 0, 2, 4 );
+            else if( b_list0[1] )          x264_mb_mc_0xywh ( h, 2, 0, 2, 4 );
+            else if( b_list1[1] )          x264_mb_mc_1xywh ( h, 2, 0, 2, 4 );
+        }
+    }
+}
+
+void x264_macroblock_cache_init( x264_t *h )
+{
+    int i_mb_count  = h->sps->i_mb_width * h->sps->i_mb_height;
+    /* Allocate the per-macroblock arrays, sized from the SPS width and height in macroblocks. */
+    h->mb.i_mb_stride = h->sps->i_mb_width;
+    /* NOTE(review): none of the x264_malloc() results below are checked here */
+    h->mb.type= x264_malloc( i_mb_count * sizeof( int8_t) );
+    h->mb.qp  = x264_malloc( i_mb_count * sizeof( int8_t) );
+    h->mb.cbp = x264_malloc( i_mb_count * sizeof( int16_t) );
+
+    /* 0 -> 3 top(4), 4 -> 6 : left(3) */
+    h->mb.intra4x4_pred_mode = x264_malloc( i_mb_count * 7 * sizeof( int8_t ) );
+
+    /* all coeffs */
+    h->mb.non_zero_count = x264_malloc( i_mb_count * 24 * sizeof( uint8_t ) );
+
+    h->mb.mv[0]  = x264_malloc( 2*16 * i_mb_count * sizeof( int16_t ) );
+    h->mb.mv[1]  = x264_malloc( 2*16 * i_mb_count * sizeof( int16_t ) );
+    h->mb.ref[0] = x264_malloc( 4 * i_mb_count * sizeof( int16_t ) );
+    h->mb.ref[1] = x264_malloc( 4 * i_mb_count * sizeof( int16_t ) );
+    /* the next three arrays exist only when CABAC is enabled */
+    if( h->param.b_cabac )
+    {
+        h->mb.chroma_pred_mode = x264_malloc( i_mb_count * sizeof( int8_t) );
+        h->mb.mvd[0] = x264_malloc( 2*16 * i_mb_count * sizeof( int16_t ) );
+        h->mb.mvd[1] = x264_malloc( 2*16 * i_mb_count * sizeof( int16_t ) );
+    }
+
+    /* initialise every cached ref to -2, i.e. not available (for top right idx=7,15) */
+    memset( h->mb.cache.ref[0], -2, X264_SCAN8_SIZE * sizeof( int8_t ) );
+    memset( h->mb.cache.ref[1], -2, X264_SCAN8_SIZE * sizeof( int8_t ) );
+}
+void x264_macroblock_cache_end( x264_t *h )
+{
+    /* Release every array allocated by x264_macroblock_cache_init(). */
+    int i_list;
+
+    if( h->param.b_cabac )
+    {   /* these were allocated only when b_cabac is set */
+        x264_free( h->mb.chroma_pred_mode );
+        for( i_list = 0; i_list < 2; i_list++ ) x264_free( h->mb.mvd[i_list] );
+    }
+    for( i_list = 0; i_list < 2; i_list++ ) x264_free( h->mb.mv[i_list] );
+    for( i_list = 0; i_list < 2; i_list++ ) x264_free( h->mb.ref[i_list] );
+    x264_free( h->mb.intra4x4_pred_mode );
+    x264_free( h->mb.non_zero_count );
+    x264_free( h->mb.cbp );
+    x264_free( h->mb.qp );
+    x264_free( h->mb.type );
+}
+
+
+void x264_macroblock_cache_load( x264_t *h, int i_mb_x, int i_mb_y )
+{   /* Prepare h->mb for macroblock (i_mb_x, i_mb_y): position, plane pointers and neighbour data in h->mb.cache. */
+    const int i_mb_4x4 = 16 * h->mb.i_mb_stride *i_mb_y + 4 * i_mb_x;  /* this MB's first entry in h->mb.mv / h->mb.mvd */
+    const int i_mb_8x8 =  4 * h->mb.i_mb_stride *i_mb_y + 2 * i_mb_x;  /* this MB's first entry in h->mb.ref */
+
+    int i_top_xy = -1;      /* stays -1 when there is no macroblock above */
+    int i_left_xy = -1;     /* stays -1 when there is no macroblock to the left */
+    int i_top_type = -1;    /* gcc warn; assigned below but not read again in this function */
+    int i_left_type= -1;
+
+    int i;
+
+    /* init index */
+    h->mb.i_mb_x = i_mb_x;
+    h->mb.i_mb_y = i_mb_y;
+    h->mb.i_mb_xy = i_mb_y * h->mb.i_mb_stride + i_mb_x;
+    h->mb.i_neighbour = 0;
+
+    /* load picture pointers */
+    for( i = 0; i < 3; i++ )
+    {
+        const int w = (i == 0 ? 16 : 8);    /* macroblock size in this plane: 16 for plane 0, 8 for planes 1 and 2 */
+        const int i_stride = h->fdec->i_stride[i];  /* NOTE(review): also used for fenc and the reference frames - presumably all share strides; confirm */
+        int   j;
+
+        h->mb.pic.i_stride[i] = i_stride;
+
+        h->mb.pic.p_fenc[i] = &h->fenc->plane[i][ w * ( i_mb_x + i_mb_y * i_stride )];
+
+        h->mb.pic.p_fdec[i] = &h->fdec->plane[i][ w * ( i_mb_x + i_mb_y * i_stride )];
+
+        for( j = 0; j < h->i_ref0; j++ )
+        {
+            h->mb.pic.p_fref[0][j][i] = &h->fref0[j]->plane[i][ w * ( i_mb_x + i_mb_y * i_stride )];
+        }
+        for( j = 0; j < h->i_ref1; j++ )
+        {
+            h->mb.pic.p_fref[1][j][i] = &h->fref1[j]->plane[i][ w * ( i_mb_x + i_mb_y * i_stride )];
+        }
+    }
+
+    /* load cache */
+    if( i_mb_y > 0 )
+    {
+        i_top_xy  = h->mb.i_mb_xy - h->mb.i_mb_stride;
+        i_top_type= h->mb.type[i_top_xy];
+
+        h->mb.i_neighbour |= MB_TOP;
+
+        /* load intra4x4: saved modes [0..3] of the top macroblock go into the cache row above this one (index - 8) */
+        h->mb.cache.intra4x4_pred_mode[x264_scan8[0] - 8] = h->mb.intra4x4_pred_mode[i_top_xy][0];
+        h->mb.cache.intra4x4_pred_mode[x264_scan8[1] - 8] = h->mb.intra4x4_pred_mode[i_top_xy][1];
+        h->mb.cache.intra4x4_pred_mode[x264_scan8[4] - 8] = h->mb.intra4x4_pred_mode[i_top_xy][2];
+        h->mb.cache.intra4x4_pred_mode[x264_scan8[5] - 8] = h->mb.intra4x4_pred_mode[i_top_xy][3];
+
+        /* load non_zero_count: entries 10, 11, 14, 15 of the top macroblock */
+        h->mb.cache.non_zero_count[x264_scan8[0] - 8] = h->mb.non_zero_count[i_top_xy][10];
+        h->mb.cache.non_zero_count[x264_scan8[1] - 8] = h->mb.non_zero_count[i_top_xy][11];
+        h->mb.cache.non_zero_count[x264_scan8[4] - 8] = h->mb.non_zero_count[i_top_xy][14];
+        h->mb.cache.non_zero_count[x264_scan8[5] - 8] = h->mb.non_zero_count[i_top_xy][15];
+
+        h->mb.cache.non_zero_count[x264_scan8[16+0] - 8] = h->mb.non_zero_count[i_top_xy][16+2];
+        h->mb.cache.non_zero_count[x264_scan8[16+1] - 8] = h->mb.non_zero_count[i_top_xy][16+3];
+
+        h->mb.cache.non_zero_count[x264_scan8[16+4+0] - 8] = h->mb.non_zero_count[i_top_xy][16+4+2];
+        h->mb.cache.non_zero_count[x264_scan8[16+4+1] - 8] = h->mb.non_zero_count[i_top_xy][16+4+3];
+    }
+    else
+    {
+        /* no macroblock above: the intra4x4 slots of the row above are set to -1 */
+        h->mb.cache.intra4x4_pred_mode[x264_scan8[0] - 8] =
+        h->mb.cache.intra4x4_pred_mode[x264_scan8[1] - 8] =
+        h->mb.cache.intra4x4_pred_mode[x264_scan8[4] - 8] =
+        h->mb.cache.intra4x4_pred_mode[x264_scan8[5] - 8] = -1;
+
+        /* ... and the non_zero_count slots to 0x80 */
+        h->mb.cache.non_zero_count[x264_scan8[0] - 8] =
+        h->mb.cache.non_zero_count[x264_scan8[1] - 8] =
+        h->mb.cache.non_zero_count[x264_scan8[4] - 8] =
+        h->mb.cache.non_zero_count[x264_scan8[5] - 8] =
+        h->mb.cache.non_zero_count[x264_scan8[16+0] - 8] =
+        h->mb.cache.non_zero_count[x264_scan8[16+1] - 8] =
+        h->mb.cache.non_zero_count[x264_scan8[16+4+0] - 8] =
+        h->mb.cache.non_zero_count[x264_scan8[16+4+1] - 8] = 0x80;
+
+    }
+
+    if( i_mb_x > 0 )
+    {
+        i_left_xy  = h->mb.i_mb_xy - 1;
+        i_left_type= h->mb.type[i_left_xy];
+
+        h->mb.i_neighbour |= MB_LEFT;
+
+        /* load intra4x4: saved modes [4], [5], [6], [3] of the left macroblock go into the cache column left of this one (index - 1) */
+        h->mb.cache.intra4x4_pred_mode[x264_scan8[0 ] - 1] = h->mb.intra4x4_pred_mode[i_left_xy][4];
+        h->mb.cache.intra4x4_pred_mode[x264_scan8[2 ] - 1] = h->mb.intra4x4_pred_mode[i_left_xy][5];
+        h->mb.cache.intra4x4_pred_mode[x264_scan8[8 ] - 1] = h->mb.intra4x4_pred_mode[i_left_xy][6];
+        h->mb.cache.intra4x4_pred_mode[x264_scan8[10] - 1] = h->mb.intra4x4_pred_mode[i_left_xy][3];
+
+        /* load non_zero_count: entries 5, 7, 13, 15 of the left macroblock */
+        h->mb.cache.non_zero_count[x264_scan8[0 ] - 1] = h->mb.non_zero_count[i_left_xy][5];
+        h->mb.cache.non_zero_count[x264_scan8[2 ] - 1] = h->mb.non_zero_count[i_left_xy][7];
+        h->mb.cache.non_zero_count[x264_scan8[8 ] - 1] = h->mb.non_zero_count[i_left_xy][13];
+        h->mb.cache.non_zero_count[x264_scan8[10] - 1] = h->mb.non_zero_count[i_left_xy][15];
+
+        h->mb.cache.non_zero_count[x264_scan8[16+0] - 1] = h->mb.non_zero_count[i_left_xy][16+1];
+        h->mb.cache.non_zero_count[x264_scan8[16+2] - 1] = h->mb.non_zero_count[i_left_xy][16+3];
+
+        h->mb.cache.non_zero_count[x264_scan8[16+4+0] - 1] = h->mb.non_zero_count[i_left_xy][16+4+1];
+        h->mb.cache.non_zero_count[x264_scan8[16+4+2] - 1] = h->mb.non_zero_count[i_left_xy][16+4+3];
+    }
+    else
+    {
+        h->mb.cache.intra4x4_pred_mode[x264_scan8[0 ] - 1] =
+        h->mb.cache.intra4x4_pred_mode[x264_scan8[2 ] - 1] =
+        h->mb.cache.intra4x4_pred_mode[x264_scan8[8 ] - 1] =
+        h->mb.cache.intra4x4_pred_mode[x264_scan8[10] - 1] = -1;
+
+        /* no macroblock to the left: non_zero_count slots are set to 0x80 */
+        h->mb.cache.non_zero_count[x264_scan8[0 ] - 1] =
+        h->mb.cache.non_zero_count[x264_scan8[2 ] - 1] =
+        h->mb.cache.non_zero_count[x264_scan8[8 ] - 1] =
+        h->mb.cache.non_zero_count[x264_scan8[10] - 1] =
+        h->mb.cache.non_zero_count[x264_scan8[16+0] - 1] =
+        h->mb.cache.non_zero_count[x264_scan8[16+2] - 1] =
+        h->mb.cache.non_zero_count[x264_scan8[16+4+0] - 1] =
+        h->mb.cache.non_zero_count[x264_scan8[16+4+2] - 1] = 0x80;
+    }
+
+    if( i_mb_y > 0 && i_mb_x < h->sps->i_mb_width - 1 )
+    {
+        h->mb.i_neighbour |= MB_TOPRIGHT;
+    }
+
+    /* load ref/mv/mvd (skipped for I slices) */
+    if( h->sh.i_type != SLICE_TYPE_I )
+    {
+        int s8x8 = 2 * h->mb.i_mb_stride;   /* entries per row of h->mb.ref */
+        int s4x4 = 4 * h->mb.i_mb_stride;   /* entries per row of h->mb.mv / h->mb.mvd */
+
+        int i_top_left_xy   = -1;
+        int i_top_right_xy  = -1;
+
+        int i_list;
+
+        if( h->mb.i_mb_y > 0 && h->mb.i_mb_x > 0 )
+        {
+            i_top_left_xy   = i_top_xy - 1;
+        }
+        if( h->mb.i_mb_y > 0 && h->mb.i_mb_x < h->sps->i_mb_width - 1 )
+        {
+            i_top_right_xy = i_top_xy + 1;
+        }
+
+        for( i_list = 0; i_list < (h->sh.i_type == SLICE_TYPE_P ? 1  : 2 ); i_list++ )
+        {   /* list 0 only for P slices, both lists otherwise; a missing neighbour gets ref -2 and a zero mv */
+            /*
+            h->mb.cache.ref[i_list][x264_scan8[5 ]+1] =
+            h->mb.cache.ref[i_list][x264_scan8[7 ]+1] =
+            h->mb.cache.ref[i_list][x264_scan8[13]+1] = -2;
+            */
+
+            if( i_top_left_xy >= 0 )
+            {
+                const int i8 = x264_scan8[0] - 1 - 1*8;
+                const int ir = i_mb_8x8 - s8x8 - 1;
+                const int iv = i_mb_4x4 - s4x4 - 1;
+                h->mb.cache.ref[i_list][i8]  = h->mb.ref[i_list][ir];
+                h->mb.cache.mv[i_list][i8][0] = h->mb.mv[i_list][iv][0];
+                h->mb.cache.mv[i_list][i8][1] = h->mb.mv[i_list][iv][1];
+            }
+            else
+            {
+                const int i8 = x264_scan8[0] - 1 - 1*8;
+                h->mb.cache.ref[i_list][i8] = -2;
+                h->mb.cache.mv[i_list][i8][0] = 0;
+                h->mb.cache.mv[i_list][i8][1] = 0;
+            }
+
+            if( i_top_xy >= 0 )
+            {
+                const int i8 = x264_scan8[0] - 8;
+                const int ir = i_mb_8x8 - s8x8;
+                const int iv = i_mb_4x4 - s4x4;
+
+                h->mb.cache.ref[i_list][i8+0] =
+                h->mb.cache.ref[i_list][i8+1] = h->mb.ref[i_list][ir + 0];
+                h->mb.cache.ref[i_list][i8+2] =
+                h->mb.cache.ref[i_list][i8+3] = h->mb.ref[i_list][ir + 1];
+
+                for( i = 0; i < 4; i++ )
+                {
+                    h->mb.cache.mv[i_list][i8+i][0] = h->mb.mv[i_list][iv + i][0];
+                    h->mb.cache.mv[i_list][i8+i][1] = h->mb.mv[i_list][iv + i][1];
+                }
+            }
+            else
+            {
+                const int i8 = x264_scan8[0] - 8;
+                for( i = 0; i < 4; i++ )
+                {
+                    h->mb.cache.ref[i_list][i8+i] = -2;
+                    h->mb.cache.mv[i_list][i8+i][0] =
+                    h->mb.cache.mv[i_list][i8+i][1] = 0;
+                }
+            }
+
+            if( i_top_right_xy >= 0 )
+            {
+                const int i8 = x264_scan8[0] + 4 - 1*8;
+                const int ir = i_mb_8x8 - s8x8 + 2;
+                const int iv = i_mb_4x4 - s4x4 + 4;
+
+                h->mb.cache.ref[i_list][i8]  = h->mb.ref[i_list][ir];
+                h->mb.cache.mv[i_list][i8][0] = h->mb.mv[i_list][iv][0];
+                h->mb.cache.mv[i_list][i8][1] = h->mb.mv[i_list][iv][1];
+            }
+            else
+            {
+                const int i8 = x264_scan8[0] + 4 - 1*8;
+                h->mb.cache.ref[i_list][i8] = -2;
+                h->mb.cache.mv[i_list][i8][0] = 0;
+                h->mb.cache.mv[i_list][i8][1] = 0;
+            }
+
+            if( i_left_xy >= 0 )
+            {
+                const int i8 = x264_scan8[0] - 1;
+                const int ir = i_mb_8x8 - 1;
+                const int iv = i_mb_4x4 - 1;
+
+                h->mb.cache.ref[i_list][i8+0*8] =
+                h->mb.cache.ref[i_list][i8+1*8] = h->mb.ref[i_list][ir + 0*s8x8];
+                h->mb.cache.ref[i_list][i8+2*8] =
+                h->mb.cache.ref[i_list][i8+3*8] = h->mb.ref[i_list][ir + 1*s8x8];
+
+                for( i = 0; i < 4; i++ )
+                {
+                    h->mb.cache.mv[i_list][i8+i*8][0] = h->mb.mv[i_list][iv + i*s4x4][0];
+                    h->mb.cache.mv[i_list][i8+i*8][1] = h->mb.mv[i_list][iv + i*s4x4][1];
+                }
+            }
+            else
+            {
+                const int i8 = x264_scan8[0] - 1;
+                for( i = 0; i < 4; i++ )
+                {
+                    h->mb.cache.ref[i_list][i8+i*8] = -2;
+                    h->mb.cache.mv[i_list][i8+i*8][0] =
+                    h->mb.cache.mv[i_list][i8+i*8][1] = 0;
+                }
+            }
+
+            if( h->param.b_cabac )
+            {   /* mvd is loaded from the top and left neighbours only; a missing neighbour gets zero */
+                if( i_top_xy >= 0 )
+                {
+                    const int i8 = x264_scan8[0] - 8;
+                    const int iv = i_mb_4x4 - s4x4;
+                    for( i = 0; i < 4; i++ )
+                    {
+                        h->mb.cache.mvd[i_list][i8+i][0] = h->mb.mvd[i_list][iv + i][0];
+                        h->mb.cache.mvd[i_list][i8+i][1] = h->mb.mvd[i_list][iv + i][1];
+                    }
+                }
+                else
+                {
+                    const int i8 = x264_scan8[0] - 8;
+                    for( i = 0; i < 4; i++ )
+                    {
+                        h->mb.cache.mvd[i_list][i8+i][0] =
+                        h->mb.cache.mvd[i_list][i8+i][1] = 0;
+                    }
+                }
+
+                if( i_left_xy >= 0 )
+                {
+                    const int i8 = x264_scan8[0] - 1;
+                    const int iv = i_mb_4x4 - 1;
+                    for( i = 0; i < 4; i++ )
+                    {
+                        h->mb.cache.mvd[i_list][i8+i*8][0] = h->mb.mvd[i_list][iv + i*s4x4][0];
+                        h->mb.cache.mvd[i_list][i8+i*8][1] = h->mb.mvd[i_list][iv + i*s4x4][1];
+                    }
+                }
+                else
+                {
+                    const int i8 = x264_scan8[0] - 1;
+                    for( i = 0; i < 4; i++ )
+                    {
+                        h->mb.cache.mvd[i_list][i8+i*8][0] =
+                        h->mb.cache.mvd[i_list][i8+i*8][1] = 0;
+                    }
+                }
+            }
+        }
+    }
+}
+
+void x264_macroblock_cache_save( x264_t *h )
+{   /* Write the current macroblock's cached results back into the per-macroblock arrays of h->mb. */
+    const int i_mb_xy = h->mb.i_mb_xy;
+    const int i_mb_type = h->mb.i_type;
+    const int i_mb_4x4 = 16 * h->mb.i_mb_stride * h->mb.i_mb_y + 4 * h->mb.i_mb_x;  /* first entry in h->mb.mv / h->mb.mvd */
+    const int i_mb_8x8 =  4 * h->mb.i_mb_stride * h->mb.i_mb_y + 2 * h->mb.i_mb_x;  /* first entry in h->mb.ref */
+
+    int i;
+
+    h->mb.i_last_dqp = h->mb.qp[i_mb_xy] - h->mb.i_last_qp;    /* qp change relative to the previously saved macroblock */
+    h->mb.i_last_qp = h->mb.qp[i_mb_xy];
+
+    /* save intra4x4: [0..3] take blocks 10, 11, 14, 15 and [4..6] take blocks 5, 7, 13 */
+    if( i_mb_type == I_4x4 )
+    {
+        h->mb.intra4x4_pred_mode[i_mb_xy][0] = h->mb.cache.intra4x4_pred_mode[x264_scan8[10] ];
+        h->mb.intra4x4_pred_mode[i_mb_xy][1] = h->mb.cache.intra4x4_pred_mode[x264_scan8[11] ];
+        h->mb.intra4x4_pred_mode[i_mb_xy][2] = h->mb.cache.intra4x4_pred_mode[x264_scan8[14] ];
+        h->mb.intra4x4_pred_mode[i_mb_xy][3] = h->mb.cache.intra4x4_pred_mode[x264_scan8[15] ];
+        h->mb.intra4x4_pred_mode[i_mb_xy][4] = h->mb.cache.intra4x4_pred_mode[x264_scan8[5] ];
+        h->mb.intra4x4_pred_mode[i_mb_xy][5] = h->mb.cache.intra4x4_pred_mode[x264_scan8[7] ];
+        h->mb.intra4x4_pred_mode[i_mb_xy][6] = h->mb.cache.intra4x4_pred_mode[x264_scan8[13] ];
+    }
+    else
+    {   /* every other macroblock type stores I_PRED_4x4_DC in all seven slots */
+        h->mb.intra4x4_pred_mode[i_mb_xy][0] =
+        h->mb.intra4x4_pred_mode[i_mb_xy][1] =
+        h->mb.intra4x4_pred_mode[i_mb_xy][2] =
+        h->mb.intra4x4_pred_mode[i_mb_xy][3] =
+        h->mb.intra4x4_pred_mode[i_mb_xy][4] =
+        h->mb.intra4x4_pred_mode[i_mb_xy][5] =
+        h->mb.intra4x4_pred_mode[i_mb_xy][6] = I_PRED_4x4_DC;
+    }
+
+    if( i_mb_type == I_PCM )
+    {   /* I_PCM: cbp is forced and all 24 counts are stored as 16 */
+        h->mb.cbp[i_mb_xy] = 0x72f;   /* all set */
+        for( i = 0; i < 16 + 2*4; i++ )
+        {
+            h->mb.non_zero_count[i_mb_xy][i] = 16;
+        }
+    }
+    else
+    {
+        /* save non zero count */
+        for( i = 0; i < 16 + 2*4; i++ )
+        {
+            h->mb.non_zero_count[i_mb_xy][i] = h->mb.cache.non_zero_count[x264_scan8[i]];
+        }
+    }
+
+    /* NOTE(review): IS_INTRA (macroblock.h) does not match I_PCM, so an I_PCM macroblock takes the first branch - confirm this is intended */
+    if( !IS_INTRA( i_mb_type ) )
+    {
+        int i_list;
+        for( i_list = 0; i_list < (h->sh.i_type == SLICE_TYPE_P ? 1  : 2 ); i_list++ )
+        {   /* copy the 2x2 refs and 4x4 mvs of this macroblock out of the cache */
+            const int s8x8 = 2 * h->mb.i_mb_stride;
+            const int s4x4 = 4 * h->mb.i_mb_stride;
+            int y,x;
+
+            h->mb.ref[i_list][i_mb_8x8+0+0*s8x8] = h->mb.cache.ref[i_list][x264_scan8[0]];
+            h->mb.ref[i_list][i_mb_8x8+1+0*s8x8] = h->mb.cache.ref[i_list][x264_scan8[4]];
+            h->mb.ref[i_list][i_mb_8x8+0+1*s8x8] = h->mb.cache.ref[i_list][x264_scan8[8]];
+            h->mb.ref[i_list][i_mb_8x8+1+1*s8x8] = h->mb.cache.ref[i_list][x264_scan8[12]];
+
+            for( y = 0; y < 4; y++ )
+            {
+                for( x = 0; x < 4; x++ )
+                {
+                    h->mb.mv[i_list][i_mb_4x4+x+y*s4x4][0] = h->mb.cache.mv[i_list][x264_scan8[0]+x+8*y][0];
+                    h->mb.mv[i_list][i_mb_4x4+x+y*s4x4][1] = h->mb.cache.mv[i_list][x264_scan8[0]+x+8*y][1];
+                }
+            }
+        }
+    }
+    else
+    {
+        int i_list;
+        for( i_list = 0; i_list < (h->sh.i_type == SLICE_TYPE_P ? 1  : 2 ); i_list++ )
+        {   /* intra macroblock: refs are stored as -1 and mvs as zero */
+            const int s8x8 = 2 * h->mb.i_mb_stride;
+            const int s4x4 = 4 * h->mb.i_mb_stride;
+            int y,x;
+
+            h->mb.ref[i_list][i_mb_8x8+0+0*s8x8] =
+            h->mb.ref[i_list][i_mb_8x8+1+0*s8x8] =
+            h->mb.ref[i_list][i_mb_8x8+0+1*s8x8] =
+            h->mb.ref[i_list][i_mb_8x8+1+1*s8x8] = -1;
+
+            for( y = 0; y < 4; y++ )
+            {
+                for( x = 0; x < 4; x++ )
+                {
+                    h->mb.mv[i_list][i_mb_4x4+x+y*s4x4][0] = 0;
+                    h->mb.mv[i_list][i_mb_4x4+x+y*s4x4][1] = 0;
+                }
+            }
+        }
+    }
+
+    if( h->param.b_cabac )
+    {   /* the arrays written below are only allocated when b_cabac is set */
+        if( i_mb_type == I_4x4 || i_mb_type == I_16x16 )
+            h->mb.chroma_pred_mode[i_mb_xy] = h->mb.i_chroma_pred_mode;
+        else
+            h->mb.chroma_pred_mode[i_mb_xy] = I_PRED_CHROMA_DC;
+
+        if( !IS_INTRA( i_mb_type ) && !IS_SKIP( i_mb_type ) )
+        {
+            int i_list;
+            for( i_list  = 0; i_list < 2; i_list++ )
+            {   /* unlike ref/mv above, both lists are written whatever the slice type */
+                const int s4x4 = 4 * h->mb.i_mb_stride;
+                int y,x;
+                for( y = 0; y < 4; y++ )
+                {
+                    for( x = 0; x < 4; x++ )
+                    {
+                        h->mb.mvd[i_list][i_mb_4x4+x+y*s4x4][0] = h->mb.cache.mvd[i_list][x264_scan8[0]+x+8*y][0];
+                        h->mb.mvd[i_list][i_mb_4x4+x+y*s4x4][1] = h->mb.cache.mvd[i_list][x264_scan8[0]+x+8*y][1];
+                    }
+                }
+            }
+        }
+        else
+        {
+            int i_list;
+            for( i_list  = 0; i_list < 2; i_list++ )
+            {   /* intra or skipped macroblock: mvd is stored as zero */
+                const int s4x4 = 4 * h->mb.i_mb_stride;
+                int y,x;
+                for( y = 0; y < 4; y++ )
+                {
+                    for( x = 0; x < 4; x++ )
+                    {
+                        h->mb.mvd[i_list][i_mb_4x4+x+y*s4x4][0] = 0;
+                        h->mb.mvd[i_list][i_mb_4x4+x+y*s4x4][1] = 0;
+                    }
+                }
+            }
+        }
+    }
+}
+
diff --git a/core/macroblock.h b/core/macroblock.h
new file mode 100644
index 00000000..5419f87e
--- /dev/null
+++ b/core/macroblock.h
@@ -0,0 +1,204 @@
+/*****************************************************************************
+ * macroblock.h: h264 encoder library
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: macroblock.h,v 1.1 2004/06/03 19:27:07 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#ifndef _MACROBLOCK_H
+#define _MACROBLOCK_H 1
+
+enum macroblock_position_e
+{   /* bit flags; LEFT/TOP/TOPRIGHT are OR-ed into h->mb.i_neighbour by x264_macroblock_cache_load() */
+    MB_LEFT     = 0x01,     /* set when i_mb_x > 0 */
+    MB_TOP      = 0x02,     /* set when i_mb_y > 0 */
+    MB_TOPRIGHT = 0x04,     /* set when i_mb_y > 0 and the macroblock is not in the last column */
+
+    MB_PRIVATE  = 0x10,
+};
+
+
+/* XXX mb_type isn't the one written in the bitstream -> only internal usage */
+#define IS_INTRA(type) ( (type) == I_4x4 || (type) == I_16x16 )    /* NOTE(review): I_PCM is not matched - confirm this is intended */
+#define IS_SKIP(type)  ( (type) == P_SKIP || (type) == B_SKIP )
+enum mb_class_e
+{   /* the values also index the rows of x264_mb_type_list0_table / x264_mb_type_list1_table */
+    I_4x4           = 0,
+    I_16x16         = 1,
+    I_PCM           = 2,
+
+    P_L0            = 3,
+    P_8x8           = 4,
+    P_SKIP          = 5,
+
+    B_DIRECT        = 6,
+    B_L0_L0         = 7,
+    B_L0_L1         = 8,
+    B_L0_BI         = 9,
+    B_L1_L0         = 10,
+    B_L1_L1         = 11,
+    B_L1_BI         = 12,
+    B_BI_L0         = 13,
+    B_BI_L1         = 14,
+    B_BI_BI         = 15,
+    B_8x8           = 16,
+    B_SKIP          = 17,
+};
+static const int x264_mb_type_list0_table[18][2] =
+{   /* one row per mb_class_e value; 1 where list 0 is used (presumably one column per partition - TODO confirm against callers) */
+    {0,0}, {0,0}, {0,0},    /* INTRA */
+    {1,1},                  /* P_L0 */
+    {0,0},                  /* P_8x8 */
+    {1,1},                  /* P_SKIP */
+    {0,0},                  /* B_DIRECT */
+    {1,1}, {1,0}, {1,1},    /* B_L0_* */
+    {0,1}, {0,0}, {0,1},    /* B_L1_* */
+    {1,1}, {1,0}, {1,1},    /* B_BI_* */
+    {0,0},                  /* B_8x8 */
+    {0,0}                   /* B_SKIP */
+};
+static const int x264_mb_type_list1_table[18][2] =
+{   /* one row per mb_class_e value; 1 where list 1 is used (presumably one column per partition - TODO confirm against callers) */
+    {0,0}, {0,0}, {0,0},    /* INTRA */
+    {0,0},                  /* P_L0 */
+    {0,0},                  /* P_8x8 */
+    {0,0},                  /* P_SKIP */
+    {0,0},                  /* B_DIRECT */
+    {0,0}, {0,1}, {0,1},    /* B_L0_* */
+    {1,0}, {1,1}, {1,1},    /* B_L1_* */
+    {1,0}, {1,1}, {1,1},    /* B_BI_* */
+    {0,0},                  /* B_8x8 */
+    {0,0}                   /* B_SKIP */
+};
+
+#define IS_SUB4x4(type) ( ((type) == D_L0_4x4) || ((type) == D_L1_4x4) || ((type) == D_BI_4x4) )   /* IS_SUB*: true for any list variant of that size; `type` is evaluated more than once */
+#define IS_SUB4x8(type) ( ((type) == D_L0_4x8) || ((type) == D_L1_4x8) || ((type) == D_BI_4x8) )
+#define IS_SUB8x4(type) ( ((type) == D_L0_8x4) || ((type) == D_L1_8x4) || ((type) == D_BI_8x4) )
+#define IS_SUB8x8(type) ( ((type) == D_L0_8x8) || ((type) == D_L1_8x8) || ((type) == D_BI_8x8) || ((type) == D_DIRECT_8x8) )
+enum mb_partition_e
+{   /* the values also index x264_mb_partition_count_table */
+    /* sub partition type for P_8x8 and B_8x8 */
+    D_L0_4x4        = 0,
+    D_L0_8x4        = 1,
+    D_L0_4x8        = 2,
+    D_L0_8x8        = 3,
+
+    /* sub partition type for B_8x8 only */
+    D_L1_4x4        = 4,
+    D_L1_8x4        = 5,
+    D_L1_4x8        = 6,
+    D_L1_8x8        = 7,
+
+    D_BI_4x4        = 8,
+    D_BI_8x4        = 9,
+    D_BI_4x8        = 10,
+    D_BI_8x8        = 11,
+    D_DIRECT_8x8    = 12,
+
+    /* partition */
+    D_8x8           = 13,
+    D_16x8          = 14,
+    D_8x16          = 15,
+    D_16x16         = 16,
+};
+
+static const int x264_mb_partition_count_table[17] =
+{   /* one count per mb_partition_e value, in enum order */
+    /* sub L0: D_L0_4x4, D_L0_8x4, D_L0_4x8, D_L0_8x8 */
+    4, 2, 2, 1,
+    /* sub L1: D_L1_4x4, D_L1_8x4, D_L1_4x8, D_L1_8x8 */
+    4, 2, 2, 1,
+    /* sub BI: D_BI_4x4, D_BI_8x4, D_BI_4x8, D_BI_8x8 */
+    4, 2, 2, 1,
+    /* Direct: D_DIRECT_8x8 */
+    1,
+    /* Partition: D_8x8, D_16x8, D_8x16, D_16x16 */
+    4, 2, 2, 1
+};
+
+void x264_macroblock_cache_init( x264_t *h );
+void x264_macroblock_cache_load( x264_t *h, int, int );
+void x264_macroblock_cache_save( x264_t *h );
+void x264_macroblock_cache_end( x264_t *h );
+
+void x264_mb_dequant_4x4_dc( int16_t dct[4][4], int i_qscale );
+void x264_mb_dequant_2x2_dc( int16_t dct[2][2], int i_qscale );
+void x264_mb_dequant_4x4( int16_t dct[4][4], int i_qscale );
+
+/* x264_mb_predict_mv_16x16:
+ *      set mvp to the predicted mv for a D_16x16 block
+ *      h->mb. only needs valid values from the other blocks */
+void x264_mb_predict_mv_16x16( x264_t *h, int i_list, int i_ref, int mvp[2] );
+/* x264_mb_predict_mv_pskip:
+ *      set mv to the predicted mv for P_SKIP
+ *      h->mb. only needs valid values from the other blocks */
+void x264_mb_predict_mv_pskip( x264_t *h, int mv[2] );
+/* x264_mb_predict_mv:
+ *      set mvp to the predicted mv for all blocks except P_SKIP
+ *      h->mb. needs valid ref/partition/sub of the current block
+ *      and valid mv/ref from the other blocks. */
+void x264_mb_predict_mv( x264_t *h, int i_list, int idx, int i_width, int mvp[2] );
+
+
+int  x264_mb_predict_intra4x4_mode( x264_t *h, int idx );
+int  x264_mb_predict_non_zero_code( x264_t *h, int idx );
+
+void x264_mb_encode_i4x4( x264_t *h, int idx, int i_qscale );
+
+void x264_mb_mc( x264_t *h );
+
+
+static inline void x264_macroblock_cache_ref( x264_t *h, int x, int y, int width, int height, int i_list, int ref )
+{   /* Store `ref` in a width x height rectangle of h->mb.cache.ref[i_list] (8 slots per cache row), top-left at (x, y). */
+    int row, col;
+    for( row = y; row < y + height; row++ )
+    {
+        for( col = x; col < x + width; col++ )
+        {
+            h->mb.cache.ref[i_list][X264_SCAN8_0 + col + 8*row] = ref;
+        }
+    }
+}
+static inline void x264_macroblock_cache_mv( x264_t *h, int x, int y, int width, int height, int i_list, int mvx, int mvy )
+{   /* Store (mvx, mvy) in a width x height rectangle of h->mb.cache.mv[i_list] (8 slots per cache row), top-left at (x, y). */
+    int row, col;
+    for( row = y; row < y + height; row++ )
+    {
+        for( col = x; col < x + width; col++ )
+        {
+            h->mb.cache.mv[i_list][X264_SCAN8_0 + col + 8*row][0] = mvx;
+            h->mb.cache.mv[i_list][X264_SCAN8_0 + col + 8*row][1] = mvy;
+        }
+    }
+}
+static inline void x264_macroblock_cache_mvd( x264_t *h, int x, int y, int width, int height, int i_list, int mdx, int mdy )
+{   /* Store (mdx, mdy) in a width x height rectangle of h->mb.cache.mvd[i_list] (8 slots per cache row), top-left at (x, y). */
+    int row, col;
+    for( row = y; row < y + height; row++ )
+    {
+        for( col = x; col < x + width; col++ )
+        {
+            h->mb.cache.mvd[i_list][X264_SCAN8_0 + col + 8*row][0] = mdx;
+            h->mb.cache.mvd[i_list][X264_SCAN8_0 + col + 8*row][1] = mdy;
+        }
+    }
+}
+
+#endif
+
diff --git a/core/mc.c b/core/mc.c
new file mode 100644
index 00000000..e7ff7541
--- /dev/null
+++ b/core/mc.c
@@ -0,0 +1,320 @@
+/*****************************************************************************
+ * mc.c: h264 encoder library (Motion Compensation)
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: mc.c,v 1.1 2004/06/03 19:27:07 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdint.h>
+
+#include "../x264.h"
+
+#include "mc.h"
+#include "clip1.h"
+
+#ifdef _MSC_VER
+#undef HAVE_MMXEXT  /* not finished now */
+#endif
+#ifdef HAVE_MMXEXT
+#   include "i386/mc.h"
+#endif
+#ifdef HAVE_ALTIVEC
+#   include "ppc/mc.h"
+#endif
+
+
+static inline int x264_tapfilter( uint8_t *pix, int i_pix_next )
+{   /* 6-tap sum (1,-5,20,20,-5,1) over samples i_pix_next apart, from pix[-2*i_pix_next] to pix[3*i_pix_next]; not rounded or clipped. */
+    return ( pix[-2*i_pix_next] + pix[ 3*i_pix_next] ) - 5*( pix[-1*i_pix_next] + pix[ 2*i_pix_next] ) + 20*( pix[0] + pix[1*i_pix_next] );
+}
+static inline int x264_tapfilter1( uint8_t *pix )
+{   /* 6-tap sum (1,-5,20,20,-5,1) over the adjacent samples pix[-2] .. pix[3]; not rounded or clipped. */
+    return ( pix[-2] + pix[ 3] ) - 5*( pix[-1] + pix[ 2] ) + 20*( pix[0] + pix[1] );
+}
+
+static inline void pixel_avg( uint8_t *dst,  int i_dst_stride,
+                              uint8_t *src1, int i_src1_stride,
+                              uint8_t *src2, int i_src2_stride,
+                              int i_width, int i_height )
+{   /* dst = ( src1 + src2 + 1 ) >> 1 for every sample of an i_width x i_height block; each buffer has its own stride. */
+    int row, col;
+    for( row = 0; row < i_height; row++ )
+    {
+        uint8_t       *p_out = dst  + row * i_dst_stride;
+        const uint8_t *p_a   = src1 + row * i_src1_stride;
+        const uint8_t *p_b   = src2 + row * i_src2_stride;
+        for( col = 0; col < i_width; col++ )
+        {
+            p_out[col] = ( p_a[col] + p_b[col] + 1 ) >> 1;
+        }
+    }
+}
+
+typedef void (*pf_mc_t)(uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_width, int i_height );  /* signature shared by mc_copy and the mc_xyNN functions below */
+
+static void mc_copy( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_width, int i_height )
+{   /* Copy i_height rows of i_width bytes from src to dst, stepping each buffer by its own stride. */
+    uint8_t *p_in  = src;
+    uint8_t *p_out = dst;
+    int rows_left;
+
+    for( rows_left = i_height; rows_left > 0; rows_left-- )
+    {
+        memcpy( p_out, p_in, i_width );
+        p_in  += i_src_stride;
+        p_out += i_dst_stride;
+    }
+}
+static inline void mc_hh( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_width, int i_height )
+{   /* Horizontal filter: x264_tapfilter1 along each row, +16 >> 5, then x264_mc_clip1. */
+    int row, col;
+    for( row = 0; row < i_height; row++ )
+    {
+        uint8_t *p_in  = src + row * i_src_stride;
+        uint8_t *p_out = dst + row * i_dst_stride;
+        for( col = 0; col < i_width; col++ )
+        {
+            const int i_sum = x264_tapfilter1( p_in + col );
+            p_out[col] = x264_mc_clip1( ( i_sum + 16 ) >> 5 );
+        }
+    }
+}
+static inline void mc_hv( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_width, int i_height )
+{   /* Vertical filter: x264_tapfilter with step i_src_stride at each sample, +16 >> 5, then x264_mc_clip1. */
+    int row, col;
+    for( row = 0; row < i_height; row++ )
+    {
+        uint8_t *p_in  = src + row * i_src_stride;
+        uint8_t *p_out = dst + row * i_dst_stride;
+        for( col = 0; col < i_width; col++ )
+        {
+            const int i_sum = x264_tapfilter( p_in + col, i_src_stride );
+            p_out[col] = x264_mc_clip1( ( i_sum + 16 ) >> 5 );
+        }
+    }
+}
+static inline void mc_hc( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_width, int i_height )
+{   /* Horizontal 6-tap sums filtered vertically with the same taps, +512 >> 10, then x264_mc_clip1; processed column by column. */
+    int col, row, k;
+
+    for( col = 0; col < i_width; col++ )
+    {
+        uint8_t *p_in  = src + col;
+        uint8_t *p_out = dst + col;
+        int win[6];     /* horizontal sums of the rows -2 .. +3 around the current row */
+
+        /* prime the window with rows -2 .. +2 */
+        for( k = 0; k < 5; k++ )
+        {
+            win[k] = x264_tapfilter1( p_in + (k - 2) * i_src_stride );
+        }
+
+        for( row = 0; row < i_height; row++ )
+        {
+            int i_sum;
+
+            /* row +3 completes the window */
+            win[5] = x264_tapfilter1( p_in + 3 * i_src_stride );
+            i_sum  = win[0] + win[5] - 5 * ( win[1] + win[4] ) + 20 * ( win[2] + win[3] );
+            *p_out = x264_mc_clip1( ( i_sum + 512 ) >> 10 );
+
+            /* slide the window down one row */
+            for( k = 0; k < 5; k++ )
+            {
+                win[k] = win[k + 1];
+            }
+
+            p_in  += i_src_stride;
+            p_out += i_dst_stride;
+        }
+    }
+}
+
+/* mc I+H */
+static void mc_xy10( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_width, int i_height )
+{   /* average of the source block and its mc_hh (horizontally filtered) version */
+    uint8_t horz[16*16];
+    mc_hh( src, i_src_stride, horz, i_width, i_width, i_height );
+    pixel_avg( dst, i_dst_stride, &src[0], i_src_stride, horz, i_width, i_width, i_height );
+}
+static void mc_xy30( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_width, int i_height )
+{   /* average of the source block shifted one sample right and the mc_hh output of the unshifted block */
+    uint8_t horz[16*16];
+    mc_hh( src, i_src_stride, horz, i_width, i_width, i_height );
+    pixel_avg( dst, i_dst_stride, &src[1], i_src_stride, horz, i_width, i_width, i_height );
+}
+/* mc I+V */
+static void mc_xy01( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_width, int i_height )
+{   /* average of the source block and its mc_hv (vertically filtered) version */
+    uint8_t vert[16*16];
+    mc_hv( src, i_src_stride, vert, i_width, i_width, i_height );
+    pixel_avg( dst, i_dst_stride, &src[0], i_src_stride, vert, i_width, i_width, i_height );
+}
+static void mc_xy03( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_width, int i_height )
+{   /* average of the source block shifted one row down and the mc_hv output of the unshifted block */
+    uint8_t vert[16*16];
+    mc_hv( src, i_src_stride, vert, i_width, i_width, i_height );
+    pixel_avg( dst, i_dst_stride, &src[i_src_stride], i_src_stride, vert, i_width, i_width, i_height );
+}
+/* H+V */
+static void mc_xy11( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_width, int i_height )
+{   /* average of mc_hv( src ) and mc_hh( src ) */
+    uint8_t vert[16*16];
+    uint8_t horz[16*16];
+
+    mc_hv( &src[0], i_src_stride, vert, i_width, i_width, i_height );
+    mc_hh( &src[0], i_src_stride, horz, i_width, i_width, i_height );
+    pixel_avg( dst, i_dst_stride, vert, i_width, horz, i_width, i_width, i_height );
+}
+static void mc_xy31( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_width, int i_height )
+{   /* average of mc_hv( src + 1 ) and mc_hh( src ) */
+    uint8_t vert[16*16];
+    uint8_t horz[16*16];
+
+    mc_hv( &src[1], i_src_stride, vert, i_width, i_width, i_height );
+    mc_hh( &src[0], i_src_stride, horz, i_width, i_width, i_height );
+    pixel_avg( dst, i_dst_stride, vert, i_width, horz, i_width, i_width, i_height );
+}
+static void mc_xy13( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_width, int i_height )
+{   /* average of mc_hv( src ) and mc_hh( src + one row ) */
+    uint8_t vert[16*16];
+    uint8_t horz[16*16];
+
+    mc_hv( &src[0],            i_src_stride, vert, i_width, i_width, i_height );
+    mc_hh( &src[i_src_stride], i_src_stride, horz, i_width, i_width, i_height );
+    pixel_avg( dst, i_dst_stride, vert, i_width, horz, i_width, i_width, i_height );
+}
+static void mc_xy33( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_width, int i_height )
+{   /* average of mc_hv( src + 1 ) and mc_hh( src + one row ) */
+    uint8_t vert[16*16];
+    uint8_t horz[16*16];
+
+    mc_hv( &src[1],            i_src_stride, vert, i_width, i_width, i_height );
+    mc_hh( &src[i_src_stride], i_src_stride, horz, i_width, i_width, i_height );
+    pixel_avg( dst, i_dst_stride, vert, i_width, horz, i_width, i_width, i_height );
+}
+static void mc_xy21( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_width, int i_height ) /* pf_mc case dqx=2, dqy=1 */
+{
+    uint8_t tmp1[16*16]; /* mc_hc output */
+    uint8_t tmp2[16*16]; /* mc_hh output */
+
+    mc_hc( src, i_src_stride, tmp1, i_width, i_width, i_height );
+    mc_hh( src, i_src_stride, tmp2, i_width, i_width, i_height );
+    pixel_avg( dst, i_dst_stride, tmp1, i_width, tmp2, i_width, i_width, i_height ); /* dst is built from the two temporaries */
+}
+static void mc_xy12( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_width, int i_height ) /* pf_mc case dqx=1, dqy=2 */
+{
+    uint8_t tmp1[16*16]; /* mc_hc output */
+    uint8_t tmp2[16*16]; /* mc_hv output */
+
+    mc_hc( src, i_src_stride, tmp1, i_width, i_width, i_height );
+    mc_hv( src, i_src_stride, tmp2, i_width, i_width, i_height );
+    pixel_avg( dst, i_dst_stride, tmp1, i_width, tmp2, i_width, i_width, i_height ); /* dst is built from the two temporaries */
+}
+static void mc_xy32( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_width, int i_height ) /* pf_mc case dqx=3, dqy=2 */
+{
+    uint8_t tmp1[16*16]; /* mc_hc output */
+    uint8_t tmp2[16*16]; /* mc_hv output */
+
+    mc_hc( src,   i_src_stride, tmp1, i_width, i_width, i_height );
+    mc_hv( src+1, i_src_stride, tmp2, i_width, i_width, i_height ); /* starts one column to the right of src */
+    pixel_avg( dst, i_dst_stride, tmp1, i_width, tmp2, i_width, i_width, i_height ); /* dst is built from the two temporaries */
+}
+static void mc_xy23( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_width, int i_height ) /* pf_mc case dqx=2, dqy=3 */
+{
+    uint8_t tmp1[16*16]; /* mc_hc output */
+    uint8_t tmp2[16*16]; /* mc_hh output */
+
+    mc_hc( src,              i_src_stride, tmp1, i_width, i_width, i_height );
+    mc_hh( src+i_src_stride, i_src_stride, tmp2, i_width, i_width, i_height ); /* starts one row below src */
+    pixel_avg( dst, i_dst_stride, tmp1, i_width, tmp2, i_width, i_width, i_height ); /* dst is built from the two temporaries */
+}
+
+static void motion_compensation_luma( uint8_t *src, int i_src_stride, /* luma MC entry point installed as pf[MC_LUMA] */
+                                      uint8_t *dst, int i_dst_stride,
+                                      int mvx,int mvy, /* motion vector; the low 2 bits of each component are the fractional part */
+                                      int i_width, int i_height )
+{
+    static pf_mc_t pf_mc[4][4] =    /*XXX [dqy][dqx]: one handler per fractional position */
+    {
+        { mc_copy,  mc_xy10,    mc_hh,      mc_xy30 },
+        { mc_xy01,  mc_xy11,    mc_xy21,    mc_xy31 },
+        { mc_hv,    mc_xy12,    mc_hc,      mc_xy32 },
+        { mc_xy03,  mc_xy13,    mc_xy23,    mc_xy33 },
+    };
+
+    src += (mvy >> 2) * i_src_stride + (mvx >> 2); /* move src by the integer part of the vector */
+    pf_mc[mvy&0x03][mvx&0x03]( src, i_src_stride, dst, i_dst_stride, i_width, i_height ); /* dispatch on the fractional part */
+}
+
+/* full chroma mc (i.e. down to 1/8 pixel): weighted blend of the 2x2 neighbourhood of each sample */
+static void motion_compensation_chroma( uint8_t *src, int i_src_stride,
+                                        uint8_t *dst, int i_dst_stride,
+                                        int mvx, int mvy,
+                                        int i_width, int i_height )
+{
+    uint8_t *srcp; /* row directly below the current src row */
+    int x, y;
+
+    const int d8x = mvx&0x07; /* fractional part of the vector, in eighths */
+    const int d8y = mvy&0x07;
+
+    const int cA = (8-d8x)*(8-d8y); /* the four weights always sum to 64 */
+    const int cB = d8x    *(8-d8y);
+    const int cC = (8-d8x)*d8y;
+    const int cD = d8x    *d8y;
+
+    src  += (mvy >> 3) * i_src_stride + (mvx >> 3); /* move src by the integer part of the vector */
+    srcp = &src[i_src_stride];
+
+    for( y = 0; y < i_height; y++ )
+    {
+        for( x = 0; x < i_width; x++ )
+        {
+            dst[x] = ( cA*src[x]  + cB*src[x+1] + /* src[x+1] and srcp[] are read even when their weight is 0 */
+                       cC*srcp[x] + cD*srcp[x+1] + 32 ) >> 6; /* +32 rounds before dividing by 64 */
+        }
+        dst  += i_dst_stride;
+
+        src   = srcp; /* the lower row becomes the upper row of the next iteration */
+        srcp += i_src_stride;
+    }
+}
+
+void x264_mc_init( int cpu, x264_mc_function_t pf[2] ) /* fills pf[MC_LUMA] and pf[MC_CHROMA]; cpu is a mask of X264_CPU_* flags */
+{
+    pf[MC_LUMA]   = motion_compensation_luma; /* plain C defaults */
+    pf[MC_CHROMA] = motion_compensation_chroma;
+
+#ifdef HAVE_MMXEXT
+    if( cpu&X264_CPU_MMXEXT )
+    {
+        x264_mc_mmxext_init( pf ); /* presumably replaces entries of pf with MMXEXT versions - confirm in i386/mc */
+    }
+#endif
+#ifdef HAVE_ALTIVEC
+    if( cpu&X264_CPU_ALTIVEC )
+    {
+        x264_mc_altivec_init( pf ); /* presumably replaces entries of pf with AltiVec versions - confirm in ppc/mc.c */
+    }
+#endif
+}
+
diff --git a/core/mc.h b/core/mc.h
new file mode 100644
index 00000000..8f91eab1
--- /dev/null
+++ b/core/mc.h
@@ -0,0 +1,45 @@
+/*****************************************************************************
+ * mc.h: h264 encoder library (Motion Compensation)
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: mc.h,v 1.1 2004/06/03 19:27:07 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#ifndef _MC_H
+#define _MC_H 1
+
+/* Do the MC (motion compensation)
+ * XXX: Only width = 4, 8 or 16 are valid
+ * width == 4 -> height == 4 or 8
+ * width == 8 -> height == 4 or 8 or 16
+ * width == 16-> height == 8 or 16
+ * */
+
+typedef void (*x264_mc_function_t)(uint8_t *, int, uint8_t *, int, /* src, src stride, dst, dst stride */
+                          int mvx, int mvy, /* motion vector, including its fractional bits */
+                          int i_width, int i_height ); /* block size, see the valid combinations above */
+enum /* indices into the pf[2] array filled by x264_mc_init */
+{
+    MC_LUMA   = 0,
+    MC_CHROMA = 1,
+};
+
+void x264_mc_init( int cpu, x264_mc_function_t pf[2] ); /* installs the MC functions selected for the given cpu flags */
+
+#endif
diff --git a/core/mdate.c b/core/mdate.c
new file mode 100644
index 00000000..5b4a2a91
--- /dev/null
+++ b/core/mdate.c
@@ -0,0 +1,48 @@
+/*****************************************************************************
+ * mdate.c: h264 encoder
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: mdate.c,v 1.1 2004/06/03 19:27:07 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#include <stdlib.h>
+#include <stdint.h>
+
+#if !(defined(_MSC_VER) || defined(__MINGW32__))
+#include <sys/time.h>
+#else
+#include <sys/types.h>
+#include <sys/timeb.h>
+#endif
+#include <time.h>
+
+int64_t x264_mdate( void ) /* returns the current wall-clock time in microseconds */
+{
+#if !(defined(_MSC_VER) || defined(__MINGW32__))
+    struct timeval tv_date;
+
+    gettimeofday( &tv_date, NULL ); /* return value is not checked */
+    return( (int64_t) tv_date.tv_sec * 1000000 + (int64_t) tv_date.tv_usec ); /* seconds -> us, plus the us remainder */
+#else
+    struct _timeb tb;
+    _ftime(&tb);
+    return ((int64_t)tb.time * (1000) + (int64_t)tb.millitm) * (1000); /* only millisecond resolution here, scaled to us */
+#endif
+}
+
diff --git a/core/pixel.c b/core/pixel.c
new file mode 100644
index 00000000..f06a099e
--- /dev/null
+++ b/core/pixel.c
@@ -0,0 +1,228 @@
+/*****************************************************************************
+ * pixel.c: h264 encoder
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: pixel.c,v 1.1 2004/06/03 19:27:07 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+
+#include "../x264.h"
+#include "pixel.h"
+
+#ifdef HAVE_MMXEXT
+#   include "i386/pixel.h"
+#endif
+#ifdef HAVE_ALTIVEC
+#   include "ppc/pixel.h"
+#endif
+
+
+/****************************************************************************
+ * pixel_sad_WxH: PIXEL_SAD_C defines a function returning the sum of absolute differences of two lx-by-ly blocks
+ ****************************************************************************/
+#define PIXEL_SAD_C( name, lx, ly ) \
+static int name( uint8_t *pix1, int i_stride_pix1,  \
+                 uint8_t *pix2, int i_stride_pix2 ) \
+{                                                   \
+    int i_sum = 0;                                  \
+    int x, y;                                       \
+    for( y = 0; y < ly; y++ )                       \
+    {                                               \
+        for( x = 0; x < lx; x++ )                   \
+        {                                           \
+            i_sum += abs( pix1[x] - pix2[x] );      \
+        }                                           \
+        pix1 += i_stride_pix1;                      \
+        pix2 += i_stride_pix2;                      \
+    }                                               \
+    return i_sum;                                   \
+}
+
+
+PIXEL_SAD_C( pixel_sad_16x16, 16, 16 ) /* one instantiation per block size listed in pixel.h */
+PIXEL_SAD_C( pixel_sad_16x8,  16,  8 )
+PIXEL_SAD_C( pixel_sad_8x16,   8, 16 )
+PIXEL_SAD_C( pixel_sad_8x8,    8,  8 )
+PIXEL_SAD_C( pixel_sad_8x4,    8,  4 )
+PIXEL_SAD_C( pixel_sad_4x8,    4,  8 )
+PIXEL_SAD_C( pixel_sad_4x4,    4,  4 )
+
+static void pixel_sub_4x4( int16_t diff[4][4], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 ) /* diff = pix1 - pix2 over a 4x4 block */
+{
+    int y, x;
+    for( y = 0; y < 4; y++ )
+    {
+        for( x = 0; x < 4; x++ )
+        {
+            diff[y][x] = pix1[x] - pix2[x]; /* signed result, range -255..255 */
+        }
+        pix1 += i_pix1; /* i_pix1 and i_pix2 are the row strides of the two blocks */
+        pix2 += i_pix2;
+    }
+}
+
+static int pixel_satd_wxh( uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2, int i_width, int i_height ) /* transformed-difference cost, accumulated per 4x4 sub-block */
+{
+    int16_t tmp[4][4]; /* result of the row pass */
+    int16_t diff[4][4]; /* pix1 - pix2 for the current 4x4 sub-block */
+    int x, y;
+    int i_satd = 0;
+
+    for( y = 0; y < i_height; y += 4 ) /* i_width and i_height are walked in steps of 4 */
+    {
+        for( x = 0; x < i_width; x += 4 )
+        {
+            int d;
+
+            pixel_sub_4x4( diff, &pix1[x], i_pix1, &pix2[x], i_pix2 );
+
+            for( d = 0; d < 4; d++ ) /* row pass: sum/difference butterflies on each row of diff */
+            {
+                int s01, s23;
+                int d01, d23;
+
+                s01 = diff[d][0] + diff[d][1]; s23 = diff[d][2] + diff[d][3];
+                d01 = diff[d][0] - diff[d][1]; d23 = diff[d][2] - diff[d][3];
+
+                tmp[d][0] = s01 + s23;
+                tmp[d][1] = s01 - s23;
+                tmp[d][2] = d01 - d23;
+                tmp[d][3] = d01 + d23;
+            }
+            for( d = 0; d < 4; d++ ) /* column pass: same butterflies down each column of tmp */
+            {
+                int s01, s23;
+                int d01, d23;
+
+                s01 = tmp[0][d] + tmp[1][d]; s23 = tmp[2][d] + tmp[3][d];
+                d01 = tmp[0][d] - tmp[1][d]; d23 = tmp[2][d] - tmp[3][d];
+
+                i_satd += abs( s01 + s23 ) + abs( s01 - s23 ) + abs( d01 - d23 ) + abs( d01 + d23 ); /* coefficients are summed directly, not stored */
+            }
+
+        }
+        pix1 += 4 * i_pix1; /* advance to the next band of four rows */
+        pix2 += 4 * i_pix2;
+    }
+
+    return i_satd / 2; /* the accumulated sum is halved before being returned */
+}
+#define PIXEL_SATD_C( name, width, height ) /* defines a fixed-size wrapper around pixel_satd_wxh */ \
+static int name( uint8_t *pix1, int i_stride_pix1, \
+                 uint8_t *pix2, int i_stride_pix2 ) \
+{ \
+    return pixel_satd_wxh( pix1, i_stride_pix1, pix2, i_stride_pix2, width, height ); \
+}
+PIXEL_SATD_C( pixel_satd_16x16, 16, 16 ) /* one instantiation per block size listed in pixel.h */
+PIXEL_SATD_C( pixel_satd_16x8,  16, 8 )
+PIXEL_SATD_C( pixel_satd_8x16,  8, 16 )
+PIXEL_SATD_C( pixel_satd_8x8,   8, 8 )
+PIXEL_SATD_C( pixel_satd_8x4,   8, 4 )
+PIXEL_SATD_C( pixel_satd_4x8,   4, 8 )
+PIXEL_SATD_C( pixel_satd_4x4,   4, 4 )
+
+
+static inline void pixel_avg_wxh( uint8_t *dst, int i_dst, uint8_t *src, int i_src, int width, int height ) /* in-place average: dst = avg( dst, src ) */
+{
+    int x, y;
+    for( y = 0; y < height; y++ )
+    {
+        for( x = 0; x < width; x++ )
+        {
+            dst[x] = ( dst[x] + src[x] + 1 ) >> 1; /* +1 rounds halves up */
+        }
+        dst += i_dst; /* i_dst and i_src are the row strides */
+        src += i_src;
+    }
+}
+
+
+#define PIXEL_AVG_C( name, width, height ) /* defines a fixed-size wrapper around pixel_avg_wxh; pix1 is both input and output */ \
+static void name( uint8_t *pix1, int i_stride_pix1, \
+                  uint8_t *pix2, int i_stride_pix2 ) \
+{ \
+    pixel_avg_wxh( pix1, i_stride_pix1, pix2, i_stride_pix2, width, height ); \
+}
+PIXEL_AVG_C( pixel_avg_16x16, 16, 16 ) /* one instantiation per block size listed in pixel.h */
+PIXEL_AVG_C( pixel_avg_16x8,  16, 8 )
+PIXEL_AVG_C( pixel_avg_8x16,  8, 16 )
+PIXEL_AVG_C( pixel_avg_8x8,   8, 8 )
+PIXEL_AVG_C( pixel_avg_8x4,   8, 4 )
+PIXEL_AVG_C( pixel_avg_4x8,   4, 8 )
+PIXEL_AVG_C( pixel_avg_4x4,   4, 4 )
+
+/****************************************************************************
+ * x264_pixel_init: fill pixf with the C functions, then override entries according to the cpu flags
+ ****************************************************************************/
+void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
+{
+    pixf->sad[PIXEL_16x16] = pixel_sad_16x16; /* plain C defaults for every block size */
+    pixf->sad[PIXEL_16x8]  = pixel_sad_16x8;
+    pixf->sad[PIXEL_8x16]  = pixel_sad_8x16;
+    pixf->sad[PIXEL_8x8]   = pixel_sad_8x8;
+    pixf->sad[PIXEL_8x4]   = pixel_sad_8x4;
+    pixf->sad[PIXEL_4x8]   = pixel_sad_4x8;
+    pixf->sad[PIXEL_4x4]   = pixel_sad_4x4;
+
+    pixf->satd[PIXEL_16x16]= pixel_satd_16x16;
+    pixf->satd[PIXEL_16x8] = pixel_satd_16x8;
+    pixf->satd[PIXEL_8x16] = pixel_satd_8x16;
+    pixf->satd[PIXEL_8x8]  = pixel_satd_8x8;
+    pixf->satd[PIXEL_8x4]  = pixel_satd_8x4;
+    pixf->satd[PIXEL_4x8]  = pixel_satd_4x8;
+    pixf->satd[PIXEL_4x4]  = pixel_satd_4x4;
+
+    pixf->avg[PIXEL_16x16]= pixel_avg_16x16;
+    pixf->avg[PIXEL_16x8] = pixel_avg_16x8;
+    pixf->avg[PIXEL_8x16] = pixel_avg_8x16;
+    pixf->avg[PIXEL_8x8]  = pixel_avg_8x8;
+    pixf->avg[PIXEL_8x4]  = pixel_avg_8x4;
+    pixf->avg[PIXEL_4x8]  = pixel_avg_4x8;
+    pixf->avg[PIXEL_4x4]  = pixel_avg_4x4;
+#ifdef HAVE_MMXEXT
+    if( cpu&X264_CPU_MMXEXT ) /* MMXEXT overrides sad and satd only; avg keeps the C versions */
+    {
+        pixf->sad[PIXEL_16x16] = x264_pixel_sad_16x16_mmxext;
+        pixf->sad[PIXEL_16x8 ] = x264_pixel_sad_16x8_mmxext;
+        pixf->sad[PIXEL_8x16 ] = x264_pixel_sad_8x16_mmxext;
+        pixf->sad[PIXEL_8x8  ] = x264_pixel_sad_8x8_mmxext;
+        pixf->sad[PIXEL_8x4  ] = x264_pixel_sad_8x4_mmxext;
+        pixf->sad[PIXEL_4x8  ] = x264_pixel_sad_4x8_mmxext;
+        pixf->sad[PIXEL_4x4]   = x264_pixel_sad_4x4_mmxext;
+
+        pixf->satd[PIXEL_16x16]= x264_pixel_satd_16x16_mmxext;
+        pixf->satd[PIXEL_16x8] = x264_pixel_satd_16x8_mmxext;
+        pixf->satd[PIXEL_8x16] = x264_pixel_satd_8x16_mmxext;
+        pixf->satd[PIXEL_8x8]  = x264_pixel_satd_8x8_mmxext;
+        pixf->satd[PIXEL_8x4]  = x264_pixel_satd_8x4_mmxext;
+        pixf->satd[PIXEL_4x8]  = x264_pixel_satd_4x8_mmxext;
+        pixf->satd[PIXEL_4x4]  = x264_pixel_satd_4x4_mmxext;
+    }
+#endif
+#ifdef HAVE_ALTIVEC
+    if( cpu&X264_CPU_ALTIVEC )
+    {
+        x264_pixel_altivec_init( pixf ); /* presumably overrides entries of pixf with AltiVec versions - confirm in ppc/pixel */
+    }
+#endif
+}
+
diff --git a/core/pixel.h b/core/pixel.h
new file mode 100644
index 00000000..a8055b5c
--- /dev/null
+++ b/core/pixel.h
@@ -0,0 +1,62 @@
+/*****************************************************************************
+ * pixel.h: h264 encoder library
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: pixel.h,v 1.1 2004/06/03 19:27:07 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#ifndef _PIXEL_H
+#define _PIXEL_H 1
+
+typedef int  (*x264_pixel_sad_t) ( uint8_t *, int, uint8_t *, int ); /* ( pix1, stride1, pix2, stride2 ) -> cost */
+typedef int  (*x264_pixel_satd_t)( uint8_t *, int, uint8_t *, int ); /* same argument layout as the sad functions */
+typedef void (*x264_pixel_avg_t) ( uint8_t *, int, uint8_t *, int ); /* the C versions average pix2 into pix1 in place */
+
+enum /* block-size indices for the function tables and for x264_pixel_size */
+{
+    PIXEL_16x16 = 0,
+    PIXEL_16x8  = 1,
+    PIXEL_8x16  = 2,
+    PIXEL_8x8   = 3,
+    PIXEL_8x4   = 4,
+    PIXEL_4x8   = 5,
+    PIXEL_4x4   = 6,
+};
+
+static const struct { /* width and height for each PIXEL_* index; static, so each including file gets its own copy */
+    int w;
+    int h;
+} x264_pixel_size[7] = {
+    { 16, 16 },
+    { 16,  8 }, {  8, 16 },
+    {  8,  8 },
+    {  8,  4 }, {  4,  8 },
+    {  4,  4 }
+};
+
+typedef struct /* function tables indexed by PIXEL_*; filled by x264_pixel_init */
+{
+    x264_pixel_sad_t  sad[7];
+    x264_pixel_satd_t satd[7];
+    x264_pixel_avg_t  avg[7];
+} x264_pixel_function_t;
+
+void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ); /* cpu is a mask of X264_CPU_* flags */
+
+#endif
diff --git a/core/ppc/mc.c b/core/ppc/mc.c
new file mode 100644
index 00000000..f4b54d77
--- /dev/null
+++ b/core/ppc/mc.c
@@ -0,0 +1,681 @@
+/*****************************************************************************
+ * mc.c: h264 encoder library (Motion Compensation)
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: mc.c,v 1.1 2004/06/03 19:27:07 fenrir Exp $
+ *
+ * Authors: Eric Petit <titer@m0k.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdint.h>
+
+#include "x264.h"
+#include "../mc.h"
+#include "../clip1.h"
+#include "mc.h"
+#include "ppccommon.h"
+
+typedef void (*pf_mc_t)( uint8_t *src, int i_src, /* signature shared by the fixed-width MC helpers below; i_src and i_dst are row strides */
+                         uint8_t *dst, int i_dst, int i_height );
+
+static inline int x264_tapfilter( uint8_t *pix, int i_pix_next ) /* unnormalised 6-tap filter (1,-5,20,20,-5,1) along a step of i_pix_next */
+{
+    return pix[-2*i_pix_next] - 5*pix[-1*i_pix_next] + 20*(pix[0] + /* reads two samples before pix and three after it */
+           pix[1*i_pix_next]) - 5*pix[ 2*i_pix_next] +
+           pix[ 3*i_pix_next];
+}
+static inline int x264_tapfilter1( uint8_t *pix ) /* same 6-tap filter (1,-5,20,20,-5,1) over adjacent bytes pix[-2..3] */
+{
+    return pix[-2] - 5*pix[-1] + 20*(pix[0] + pix[1]) - 5*pix[ 2] + /* result is unnormalised; callers add 16 and shift by 5 */
+           pix[ 3];
+}
+
+/* pixel_avg: dst = rounded average of src1 and src2, for blocks 4, 8 or 16 pixels wide */
+static inline void pixel_avg_w4( uint8_t *dst,  int i_dst,
+                                 uint8_t *src1, int i_src1,
+                                 uint8_t *src2, int i_src2,
+                                 int i_height )
+{
+    int x, y;
+    for( y = 0; y < i_height; y++ )
+    {
+        for( x = 0; x < 4; x++ )
+        {
+            dst[x] = ( src1[x] + src2[x] + 1 ) >> 1; /* +1 rounds halves up */
+        }
+        dst  += i_dst; /* each buffer advances by its own stride */
+        src1 += i_src1;
+        src2 += i_src2;
+    }
+}
+static inline void pixel_avg_w8( uint8_t *dst,  int i_dst, /* 8-wide average, done as two 4-wide halves */
+                                 uint8_t *src1, int i_src1,
+                                 uint8_t *src2, int i_src2,
+                                 int i_height )
+{
+    /* TODO - optimize */
+    pixel_avg_w4( &dst[0], i_dst, &src1[0], i_src1, &src2[0], i_src2, /* columns 0..3 */
+                  i_height );
+    pixel_avg_w4( &dst[4], i_dst, &src1[4], i_src1, &src2[4], i_src2, /* columns 4..7 */
+                  i_height );
+}
+static inline void pixel_avg_w16( uint8_t *dst,  int i_dst, /* 16-wide average, one AltiVec vector per row */
+                                  uint8_t *src1, int i_src1,
+                                  uint8_t *src2, int i_src2,
+                                  int i_height )
+{
+    int y;
+    vector_u8_t src1v, src2v;
+    for( y = 0; y < i_height; y++ )
+    {
+        LOAD_16( src1, src1v ); /* LOAD_16/STORE_16 come from ppccommon.h; presumably 16-byte load/store helpers - confirm alignment handling there */
+        LOAD_16( src2, src2v );
+        src1v = vec_avg( src1v, src2v ); /* per-byte rounded average */
+        STORE_16( src1v, dst );
+
+        dst  += i_dst;
+        src1 += i_src1;
+        src2 += i_src2;
+    }
+}
+
+/* mc_copy: plain c; MC_COPY defines a function copying i_height rows of a bytes each from src to dst */
+#define MC_COPY( name, a )                                \
+static void name( uint8_t *src, int i_src,                \
+                  uint8_t *dst, int i_dst, int i_height ) \
+{                                                         \
+    int y;                                                \
+    for( y = 0; y < i_height; y++ )                       \
+    {                                                     \
+        memcpy( dst, src, a );                            \
+        src += i_src;                                     \
+        dst += i_dst;                                     \
+    }                                                     \
+}
+MC_COPY( mc_copy_w4,  4  ) /* one copy routine per supported block width */
+MC_COPY( mc_copy_w8,  8  )
+MC_COPY( mc_copy_w16, 16 )
+
+/* TAP_FILTER: 6-tap filter on eight 16-bit lanes at once, leaving a value used as an index into x264_mc_clip1_table
+   a is source (vector_s16_t [6])
+   b is a temporary vector_s16_t
+   c is the result
+
+   c   = a[0] + a[5] - 5 * ( a[1] + a[4] ) + 20 * ( a[2] + a[3] );
+   c  += 16;  (added as 8 + 8: vec_splat_s16 only accepts literals in -16..15)
+   c >>= 5;   (arithmetic shift via vec_sra: the sum can be negative and must stay negative, as in the scalar code)
+   c  += 80;  (built as 5 << 4; presumably the zero offset of x264_mc_clip1_table - confirm in clip1.h) */
+#define TAP_FILTER( a, b, c )                       \
+    c = vec_add( a[0], a[5] );                      \
+    b = vec_add( a[1], a[4] );                      \
+    c = vec_sub( c, b );                            \
+    b = vec_sl( b, vec_splat_u16( 2 ) );            \
+    c = vec_sub( c, b );                            \
+    b = vec_add( a[2], a[3] );                      \
+    b = vec_sl( b, vec_splat_u16( 2 ) );            \
+    c = vec_add( c, b );                            \
+    b = vec_sl( b, vec_splat_u16( 2 ) );            \
+    c = vec_add( c, b );                            \
+    c = vec_add( c, vec_splat_s16( 8 ) );           \
+    c = vec_add( c, vec_splat_s16( 8 ) );           \
+    c = vec_sra( c, vec_splat_u16( 5 ) );           \
+    c = vec_add( c, vec_sl( vec_splat_s16( 5 ),     \
+                            vec_splat_u16( 4 ) ) );
+
+/* mc_hh: horizontal 6-tap filter (x264_tapfilter1), rounded, shifted by 5 and clipped */
+static inline void mc_hh_w4( uint8_t *src, int i_src,
+                             uint8_t *dst, int i_dst, int i_height )
+{
+    int x, y;
+    for( y = 0; y < i_height; y++ )
+    {
+        for( x = 0; x < 4; x++ )
+        {
+            dst[x] = x264_mc_clip1( ( x264_tapfilter1( &src[x] ) + /* reads src[x-2..x+3] */
+                                      16 ) >> 5 ); /* +16 rounds before the division by 32 */
+        }
+        src += i_src;
+        dst += i_dst;
+    }
+}
+static inline void mc_hh_w8( uint8_t *src, int i_src, /* AltiVec horizontal filter: 8 output pixels per row */
+                             uint8_t *dst, int i_dst, int i_height )
+{
+    int x, y;
+    DECLARE_ALIGNED( int16_t, tmp[8], 16 ); /* vec_st target; holds the eight table indices of one row */
+
+    LOAD_ZERO; /* macro from ppccommon.h; presumably declares the zero_u8 vector used below - confirm */
+    vector_u8_t    loadv;
+    vector_s16_t   srcv[6]; /* the six tap inputs, each as eight 16-bit lanes */
+    vector_u8_t  * _srcv = (vector_u8_t*) srcv; /* byte view of the same six vectors */
+    vector_s16_t   dstv;
+    vector_s16_t   tmpv;
+
+    for( y = 0; y < i_height; y++ )
+    {
+        LOAD_16( &src[-2], loadv ); /* 16 bytes starting two pixels left of the row */
+
+        for( x = 0; x < 6; x++ )
+        {
+            _srcv[x] = vec_perm( loadv, zero_u8, /* row shifted left by x bytes, zero filled */
+                                 vec_lvsl( 0, (int*) x ) ); /* x is used as a fake address so lvsl yields a shift-by-x permute mask */
+            CONVERT_U8_TO_S16( srcv[x] ); /* presumably widens the leading 8 bytes to 16-bit lanes - confirm in ppccommon.h */
+        }
+
+        TAP_FILTER( srcv, tmpv, dstv );
+        vec_st( dstv, 0, tmp );
+
+        for( x = 0; x < 8; x++ )
+        {
+            dst[x] = x264_mc_clip1_table[tmp[x]]; /* the filter result is used directly as a table index */
+        }
+
+        src += i_src;
+        dst += i_dst;
+    }
+}
+static inline void mc_hh_w16( uint8_t *src, int i_src, /* 16-wide horizontal filter, done as two 8-wide halves */
+                              uint8_t *dst, int i_dst, int i_height )
+{
+    mc_hh_w8( &src[0], i_src, &dst[0], i_dst, i_height ); /* columns 0..7 */
+    mc_hh_w8( &src[8], i_src, &dst[8], i_dst, i_height ); /* columns 8..15 */
+}
+
+/* mc_hv: vertical 6-tap filter (x264_tapfilter with a step of i_src), rounded, shifted by 5 and clipped */
+static inline void mc_hv_w4( uint8_t *src, int i_src,
+                             uint8_t *dst, int i_dst, int i_height )
+{
+    int x, y;
+    for( y = 0; y < i_height; y++ )
+    {
+        for( x = 0; x < 4; x++ )
+        {
+            dst[x] = x264_mc_clip1( ( x264_tapfilter( &src[x], i_src ) + /* reads two rows above and three rows below */
+                                      16 ) >> 5 ); /* +16 rounds before the division by 32 */
+        }
+        src += i_src;
+        dst += i_dst;
+    }
+}
+static inline void mc_hv_w8( uint8_t *src, int i_src, /* AltiVec vertical filter: keeps a sliding window of six source rows */
+                             uint8_t *dst, int i_dst, int i_height )
+{
+    int x, y;
+    DECLARE_ALIGNED( int16_t, tmp[8], 16 ); /* vec_st target; holds the eight table indices of one row */
+
+    LOAD_ZERO; /* macro from ppccommon.h; presumably needed by the LOAD_8/CONVERT macros - confirm */
+    vector_s16_t   srcv[6]; /* rows y-2 .. y+3, each as eight 16-bit lanes */
+    vector_u8_t  * _srcv = (vector_u8_t*) srcv; /* byte view of the same six vectors */
+    vector_s16_t   dstv;
+    vector_s16_t   tmpv;
+
+    for( y = 0; y < i_height; y++ )
+    {
+        if( y )
+        {
+            for( x = 0; x < 5; x++ ) /* slide the window down by one row */
+            {
+                srcv[x] = srcv[x+1];
+            }
+            LOAD_8( &src[3*i_src], _srcv[5] ); /* only the new bottom row has to be loaded */
+            CONVERT_U8_TO_S16( srcv[5] );
+        }
+        else
+        {
+            for( x = 0; x < 6; x++ ) /* first row: load all six rows of the window */
+            {
+                LOAD_8( &src[(x-2)*i_src], _srcv[x] );
+                CONVERT_U8_TO_S16( srcv[x] );
+            }
+        }
+
+        TAP_FILTER( srcv, tmpv, dstv );
+        vec_st( dstv, 0, tmp );
+
+        for( x = 0; x < 8; x++ )
+        {
+            dst[x] = x264_mc_clip1_table[tmp[x]]; /* the filter result is used directly as a table index */
+        }
+        src += i_src;
+        dst += i_dst;
+    }
+}
+static inline void mc_hv_w16( uint8_t *src, int i_src, /* 16-wide vertical filter, done as two 8-wide halves */
+                              uint8_t *dst, int i_dst, int i_height )
+{
+    mc_hv_w8( &src[0], i_src, &dst[0], i_dst, i_height ); /* columns 0..7 */
+    mc_hv_w8( &src[8], i_src, &dst[8], i_dst, i_height ); /* columns 8..15 */
+}
+
+/* mc_hc: horizontal 6-tap filter on six rows, then the same 6-tap filter vertically on those results */
+static inline void mc_hc_w4( uint8_t *src, int i_src,
+                             uint8_t *dst, int i_dst, int i_height )
+{
+    uint8_t *out;
+    uint8_t *pix;
+    int x, y;
+
+    for( x = 0; x < 4; x++ ) /* processed column by column */
+    {
+        int tap[6]; /* unnormalised horizontal results for rows y-2 .. y+3 */
+
+        pix = &src[x];
+        out = &dst[x];
+
+        tap[0] = x264_tapfilter1( &pix[-2*i_src] ); /* prime the first five rows of the window */
+        tap[1] = x264_tapfilter1( &pix[-1*i_src] );
+        tap[2] = x264_tapfilter1( &pix[ 0*i_src] );
+        tap[3] = x264_tapfilter1( &pix[ 1*i_src] );
+        tap[4] = x264_tapfilter1( &pix[ 2*i_src] );
+
+        for( y = 0; y < i_height; y++ )
+        {
+            tap[5] = x264_tapfilter1( &pix[ 3*i_src] ); /* only the new bottom row is filtered per step */
+
+            *out = x264_mc_clip1( ( tap[0] - 5*tap[1] + 20 * tap[2] +
+                                    20 * tap[3] -5*tap[4] + tap[5] +
+                                    512 ) >> 10 ); /* two filter passes: round with 512 and divide by 1024 */
+
+            /* Next line */
+            pix += i_src;
+            out += i_dst;
+            tap[0] = tap[1]; /* slide the window down by one row */
+            tap[1] = tap[2];
+            tap[2] = tap[3];
+            tap[3] = tap[4];
+            tap[4] = tap[5];
+        }
+    }
+}
+static inline void mc_hc_w8( uint8_t *src, int i_src, /* 8-wide mc_hc, done as two 4-wide halves */
+                             uint8_t *dst, int i_dst, int i_height )
+{
+    /* TODO: optimize */
+    mc_hc_w4( &src[0], i_src, &dst[0], i_dst, i_height ); /* columns 0..3 */
+    mc_hc_w4( &src[4], i_src, &dst[4], i_dst, i_height ); /* columns 4..7 */
+}
+static inline void mc_hc_w16( uint8_t *src, int i_src, /* 16-wide mc_hc, done as two 8-wide halves */
+                              uint8_t *dst, int i_dst, int i_height )
+{
+    mc_hc_w8( &src[0], i_src, &dst[0], i_dst, i_height ); /* columns 0..7 */
+    mc_hc_w8( &src[8], i_src, &dst[8], i_dst, i_height ); /* columns 8..15 */
+}
+
+/* mc I+H: average of the unfiltered samples and the horizontally filtered (mc_hh) samples */
+static void mc_xy10_w4( uint8_t *src, int i_src,
+                        uint8_t *dst, int i_dst, int i_height )
+{
+    uint8_t tmp[16*4]; /* mc_hh output, stored with a stride of 4 */
+    mc_hh_w4( src, i_src, tmp, 4, i_height );
+    pixel_avg_w4( dst, i_dst, src, i_src, tmp, 4, i_height ); /* dst = avg( src, tmp ) */
+}
+static void mc_xy10_w8( uint8_t *src, int i_src, /* 8-wide variant of mc_xy10 */
+                        uint8_t *dst, int i_dst, int i_height )
+{
+    uint8_t tmp[16*8]; /* mc_hh output, stored with a stride of 8 */
+    mc_hh_w8( src, i_src, tmp, 8, i_height );
+    pixel_avg_w8( dst, i_dst, src, i_src, tmp, 8, i_height ); /* dst = avg( src, tmp ) */
+}
+static void mc_xy10_w16( uint8_t *src, int i_src, /* 16-wide variant of mc_xy10 */
+                         uint8_t *dst, int i_dst, int i_height )
+{
+    uint8_t tmp[16*16]; /* mc_hh output, stored with a stride of 16 */
+    mc_hh_w16( src, i_src, tmp, 16, i_height );
+    pixel_avg_w16( dst, i_dst, src, i_src, tmp, 16, i_height ); /* dst = avg( src, tmp ) */
+}
+
+static void mc_xy30_w4( uint8_t *src, int i_src, /* like mc_xy10_w4, but averages with the sample one column to the right */
+                        uint8_t *dst, int i_dst, int i_height )
+{
+    uint8_t tmp[16*4]; /* mc_hh output, stored with a stride of 4 */
+    mc_hh_w4( src, i_src, tmp, 4, i_height );
+    pixel_avg_w4( dst, i_dst, src + 1, i_src, tmp, 4, i_height ); /* dst = avg( src + 1, tmp ) */
+}
+static void mc_xy30_w8( uint8_t *src, int i_src, /* 8-wide variant of mc_xy30 */
+                        uint8_t *dst, int i_dst, int i_height )
+{
+    uint8_t tmp[16*8]; /* mc_hh output, stored with a stride of 8 */
+    mc_hh_w8( src, i_src, tmp, 8, i_height );
+    pixel_avg_w8( dst, i_dst, src + 1, i_src, tmp, 8, i_height ); /* dst = avg( src + 1, tmp ) */
+}
+static void mc_xy30_w16( uint8_t *src, int i_src, /* 16-wide variant of mc_xy30 */
+                         uint8_t *dst, int i_dst, int i_height )
+{
+    uint8_t tmp[16*16]; /* mc_hh output, stored with a stride of 16 */
+    mc_hh_w16( src, i_src, tmp, 16, i_height );
+    pixel_avg_w16( dst, i_dst, src + 1, i_src, tmp, 16, i_height ); /* dst = avg( src + 1, tmp ) */
+}
+
+/* mc I+V: average of the unfiltered samples and the vertically filtered (mc_hv) samples */
+static void mc_xy01_w4( uint8_t *src, int i_src,
+                        uint8_t *dst, int i_dst, int i_height )
+{
+    uint8_t tmp[16*4]; /* mc_hv output, stored with a stride of 4 */
+    mc_hv_w4( src, i_src, tmp, 4, i_height );
+    pixel_avg_w4( dst, i_dst, src, i_src, tmp, 4, i_height ); /* dst = avg( src, tmp ) */
+}
+static void mc_xy01_w8( uint8_t *src, int i_src, /* 8-wide variant of mc_xy01 */
+                        uint8_t *dst, int i_dst, int i_height )
+{
+    uint8_t tmp[16*8]; /* mc_hv output, stored with a stride of 8 */
+    mc_hv_w8( src, i_src, tmp, 8, i_height );
+    pixel_avg_w8( dst, i_dst, src, i_src, tmp, 8, i_height ); /* dst = avg( src, tmp ) */
+}
+static void mc_xy01_w16( uint8_t *src, int i_src, /* 16-wide variant of mc_xy01 */
+                         uint8_t *dst, int i_dst, int i_height )
+{
+    uint8_t tmp[16*16]; /* mc_hv output, stored with a stride of 16 */
+    mc_hv_w16( src, i_src, tmp, 16, i_height );
+    pixel_avg_w16( dst, i_dst, src, i_src, tmp, 16, i_height ); /* dst = avg( src, tmp ) */
+}
+
+static void mc_xy03_w4( uint8_t *src, int i_src, /* like mc_xy01_w4, but averages with the sample one row below */
+                        uint8_t *dst, int i_dst, int i_height )
+{
+    uint8_t tmp[16*4]; /* mc_hv output, stored with a stride of 4 */
+    mc_hv_w4( src, i_src, tmp, 4, i_height );
+    pixel_avg_w4( dst, i_dst, src + i_src, i_src, tmp, 4, i_height ); /* dst = avg( row below src, tmp ) */
+}
+static void mc_xy03_w8( uint8_t *src, int i_src, /* 8-wide variant of mc_xy03 */
+                        uint8_t *dst, int i_dst, int i_height )
+{
+    uint8_t tmp[16*8]; /* mc_hv output, stored with a stride of 8 */
+    mc_hv_w8( src, i_src, tmp, 8, i_height );
+    pixel_avg_w8( dst, i_dst, src + i_src, i_src, tmp, 8, i_height ); /* dst = avg( row below src, tmp ) */
+}
+static void mc_xy03_w16( uint8_t *src, int i_src, /* 16-wide variant of mc_xy03 */
+                         uint8_t *dst, int i_dst, int i_height )
+{
+    uint8_t tmp[16*16]; /* mc_hv output, stored with a stride of 16 */
+    mc_hv_w16( src, i_src, tmp, 16, i_height );
+    pixel_avg_w16( dst, i_dst, src + i_src, i_src, tmp, 16, i_height ); /* dst = avg( row below src, tmp ) */
+}
+
+/* H+V: average of the vertically filtered (mc_hv) and horizontally filtered (mc_hh) samples */
+static void mc_xy11_w4( uint8_t *src, int i_src,
+                        uint8_t *dst, int i_dst, int i_height )
+{
+    uint8_t tmp1[16*4]; /* mc_hv output */
+    uint8_t tmp2[16*4]; /* mc_hh output */
+    mc_hv_w4( src, i_src, tmp1, 4, i_height );
+    mc_hh_w4( src, i_src, tmp2, 4, i_height );
+    pixel_avg_w4( dst, i_dst, tmp1, 4, tmp2, 4, i_height ); /* dst = avg( tmp1, tmp2 ) */
+}
+static void mc_xy11_w8( uint8_t *src, int i_src, /* 8-wide variant of mc_xy11 */
+                        uint8_t *dst, int i_dst, int i_height )
+{
+    uint8_t tmp1[16*8]; /* mc_hv output */
+    uint8_t tmp2[16*8]; /* mc_hh output */
+    mc_hv_w8( src, i_src, tmp1, 8, i_height );
+    mc_hh_w8( src, i_src, tmp2, 8, i_height );
+    pixel_avg_w8( dst, i_dst, tmp1, 8, tmp2, 8, i_height ); /* dst = avg( tmp1, tmp2 ) */
+}
+/* pf_mc entry [dqy=1][dqx=1], width 16.
+ * Runs mc_hv_w16() and mc_hh_w16() on src and hands both results to pixel_avg_w16(). */
+static void mc_xy11_w16( uint8_t *src, int i_src, uint8_t *dst, int i_dst, int i_height )
+{
+    uint8_t buf_hv[16*16], buf_hh[16*16];    /* stride-16 scratch, 16 rows each */
+    mc_hv_w16( src, i_src, buf_hv, 16, i_height );
+    mc_hh_w16( src, i_src, buf_hh, 16, i_height );
+    pixel_avg_w16( dst, i_dst, buf_hv, 16, buf_hh, 16, i_height );
+}
+
+/* pf_mc entry [dqy=1][dqx=3], width 4.
+ * pixel_avg_w4() of mc_hv_w4() taken one column to the right (src + 1) and mc_hh_w4( src ). */
+static void mc_xy31_w4( uint8_t *src, int i_src, uint8_t *dst, int i_dst, int i_height )
+{
+    uint8_t buf_hv[16*4], buf_hh[16*4];    /* stride-4 scratch, 16 rows each */
+    mc_hv_w4( src + 1, i_src, buf_hv, 4, i_height );
+    mc_hh_w4( src,     i_src, buf_hh, 4, i_height );
+    pixel_avg_w4( dst, i_dst, buf_hv, 4, buf_hh, 4, i_height );
+}
+/* pf_mc entry [dqy=1][dqx=3], width 8.
+ * pixel_avg_w8() of mc_hv_w8() taken one column to the right (src + 1) and mc_hh_w8( src ). */
+static void mc_xy31_w8( uint8_t *src, int i_src, uint8_t *dst, int i_dst, int i_height )
+{
+    uint8_t buf_hv[16*8], buf_hh[16*8];    /* stride-8 scratch, 16 rows each */
+    mc_hv_w8( src + 1, i_src, buf_hv, 8, i_height );
+    mc_hh_w8( src,     i_src, buf_hh, 8, i_height );
+    pixel_avg_w8( dst, i_dst, buf_hv, 8, buf_hh, 8, i_height );
+}
+/* pf_mc entry [dqy=1][dqx=3], width 16.
+ * pixel_avg_w16() of mc_hv_w16() taken one column to the right (src + 1) and mc_hh_w16( src ). */
+static void mc_xy31_w16( uint8_t *src, int i_src, uint8_t *dst, int i_dst, int i_height )
+{
+    uint8_t buf_hv[16*16], buf_hh[16*16];    /* stride-16 scratch, 16 rows each */
+    mc_hv_w16( src + 1, i_src, buf_hv, 16, i_height );
+    mc_hh_w16( src,     i_src, buf_hh, 16, i_height );
+    pixel_avg_w16( dst, i_dst, buf_hv, 16, buf_hh, 16, i_height );
+}
+
+/* pf_mc entry [dqy=3][dqx=1], width 4.
+ * pixel_avg_w4() of mc_hv_w4( src ) and mc_hh_w4() taken one row down (src + i_src). */
+static void mc_xy13_w4( uint8_t *src, int i_src, uint8_t *dst, int i_dst, int i_height )
+{
+    uint8_t buf_hv[16*4], buf_hh[16*4];    /* stride-4 scratch, 16 rows each */
+    mc_hv_w4( src,         i_src, buf_hv, 4, i_height );
+    mc_hh_w4( src + i_src, i_src, buf_hh, 4, i_height );
+    pixel_avg_w4( dst, i_dst, buf_hv, 4, buf_hh, 4, i_height );
+}
+/* pf_mc entry [dqy=3][dqx=1], width 8.
+ * pixel_avg_w8() of mc_hv_w8( src ) and mc_hh_w8() taken one row down (src + i_src). */
+static void mc_xy13_w8( uint8_t *src, int i_src, uint8_t *dst, int i_dst, int i_height )
+{
+    uint8_t buf_hv[16*8], buf_hh[16*8];    /* stride-8 scratch, 16 rows each */
+    mc_hv_w8( src,         i_src, buf_hv, 8, i_height );
+    mc_hh_w8( src + i_src, i_src, buf_hh, 8, i_height );
+    pixel_avg_w8( dst, i_dst, buf_hv, 8, buf_hh, 8, i_height );
+}
+/* pf_mc entry [dqy=3][dqx=1], width 16.
+ * pixel_avg_w16() of mc_hv_w16( src ) and mc_hh_w16() taken one row down (src + i_src). */
+static void mc_xy13_w16( uint8_t *src, int i_src, uint8_t *dst, int i_dst, int i_height )
+{
+    uint8_t buf_hv[16*16], buf_hh[16*16];    /* stride-16 scratch, 16 rows each */
+    mc_hv_w16( src,         i_src, buf_hv, 16, i_height );
+    mc_hh_w16( src + i_src, i_src, buf_hh, 16, i_height );
+    pixel_avg_w16( dst, i_dst, buf_hv, 16, buf_hh, 16, i_height );
+}
+
+/* pf_mc entry [dqy=3][dqx=3], width 4.
+ * pixel_avg_w4() of mc_hv_w4() at src + 1 (next column) and mc_hh_w4() at src + i_src (next row). */
+static void mc_xy33_w4( uint8_t *src, int i_src, uint8_t *dst, int i_dst, int i_height )
+{
+    uint8_t buf_hv[16*4], buf_hh[16*4];    /* stride-4 scratch, 16 rows each */
+    mc_hv_w4( src + 1,     i_src, buf_hv, 4, i_height );
+    mc_hh_w4( src + i_src, i_src, buf_hh, 4, i_height );
+    pixel_avg_w4( dst, i_dst, buf_hv, 4, buf_hh, 4, i_height );
+}
+/* pf_mc entry [dqy=3][dqx=3], width 8.
+ * pixel_avg_w8() of mc_hv_w8() at src + 1 (next column) and mc_hh_w8() at src + i_src (next row). */
+static void mc_xy33_w8( uint8_t *src, int i_src, uint8_t *dst, int i_dst, int i_height )
+{
+    uint8_t buf_hv[16*8], buf_hh[16*8];    /* stride-8 scratch, 16 rows each */
+    mc_hv_w8( src + 1,     i_src, buf_hv, 8, i_height );
+    mc_hh_w8( src + i_src, i_src, buf_hh, 8, i_height );
+    pixel_avg_w8( dst, i_dst, buf_hv, 8, buf_hh, 8, i_height );
+}
+/* pf_mc entry [dqy=3][dqx=3], width 16.
+ * pixel_avg_w16() of mc_hv_w16() at src + 1 (next column) and mc_hh_w16() at src + i_src (next row). */
+static void mc_xy33_w16( uint8_t *src, int i_src, uint8_t *dst, int i_dst, int i_height )
+{
+    uint8_t buf_hv[16*16], buf_hh[16*16];    /* stride-16 scratch, 16 rows each */
+    mc_hv_w16( src + 1,     i_src, buf_hv, 16, i_height );
+    mc_hh_w16( src + i_src, i_src, buf_hh, 16, i_height );
+    pixel_avg_w16( dst, i_dst, buf_hv, 16, buf_hh, 16, i_height );
+}
+
+/* pf_mc entry [dqy=1][dqx=2], width 4.
+ * Runs mc_hc_w4() and mc_hh_w4() on src and hands both results to pixel_avg_w4(). */
+static void mc_xy21_w4( uint8_t *src, int i_src, uint8_t *dst, int i_dst, int i_height )
+{
+    uint8_t buf_hc[16*4], buf_hh[16*4];    /* stride-4 scratch, 16 rows each */
+    mc_hc_w4( src, i_src, buf_hc, 4, i_height );
+    mc_hh_w4( src, i_src, buf_hh, 4, i_height );
+    pixel_avg_w4( dst, i_dst, buf_hc, 4, buf_hh, 4, i_height );
+}
+/* pf_mc entry [dqy=1][dqx=2], width 8.
+ * Runs mc_hc_w8() and mc_hh_w8() on src and hands both results to pixel_avg_w8(). */
+static void mc_xy21_w8( uint8_t *src, int i_src, uint8_t *dst, int i_dst, int i_height )
+{
+    uint8_t buf_hc[16*8], buf_hh[16*8];    /* stride-8 scratch, 16 rows each */
+    mc_hc_w8( src, i_src, buf_hc, 8, i_height );
+    mc_hh_w8( src, i_src, buf_hh, 8, i_height );
+    pixel_avg_w8( dst, i_dst, buf_hc, 8, buf_hh, 8, i_height );
+}
+/* pf_mc entry [dqy=1][dqx=2], width 16.
+ * Runs mc_hc_w16() and mc_hh_w16() on src and hands both results to pixel_avg_w16(). */
+static void mc_xy21_w16( uint8_t *src, int i_src, uint8_t *dst, int i_dst, int i_height )
+{
+    uint8_t buf_hc[16*16], buf_hh[16*16];    /* stride-16 scratch, 16 rows each */
+    mc_hc_w16( src, i_src, buf_hc, 16, i_height );
+    mc_hh_w16( src, i_src, buf_hh, 16, i_height );
+    pixel_avg_w16( dst, i_dst, buf_hc, 16, buf_hh, 16, i_height );
+}
+
+/* pf_mc entry [dqy=2][dqx=1], width 4.
+ * Runs mc_hc_w4() and mc_hv_w4() on src and hands both results to pixel_avg_w4(). */
+static void mc_xy12_w4( uint8_t *src, int i_src, uint8_t *dst, int i_dst, int i_height )
+{
+    uint8_t buf_hc[16*4], buf_hv[16*4];    /* stride-4 scratch, 16 rows each */
+    mc_hc_w4( src, i_src, buf_hc, 4, i_height );
+    mc_hv_w4( src, i_src, buf_hv, 4, i_height );
+    pixel_avg_w4( dst, i_dst, buf_hc, 4, buf_hv, 4, i_height );
+}
+/* pf_mc entry [dqy=2][dqx=1], width 8.
+ * Runs mc_hc_w8() and mc_hv_w8() on src and hands both results to pixel_avg_w8(). */
+static void mc_xy12_w8( uint8_t *src, int i_src, uint8_t *dst, int i_dst, int i_height )
+{
+    uint8_t buf_hc[16*8], buf_hv[16*8];    /* stride-8 scratch, 16 rows each */
+    mc_hc_w8( src, i_src, buf_hc, 8, i_height );
+    mc_hv_w8( src, i_src, buf_hv, 8, i_height );
+    pixel_avg_w8( dst, i_dst, buf_hc, 8, buf_hv, 8, i_height );
+}
+/* pf_mc entry [dqy=2][dqx=1], width 16.
+ * Runs mc_hc_w16() and mc_hv_w16() on src and hands both results to pixel_avg_w16(). */
+static void mc_xy12_w16( uint8_t *src, int i_src, uint8_t *dst, int i_dst, int i_height )
+{
+    uint8_t buf_hc[16*16], buf_hv[16*16];    /* stride-16 scratch, 16 rows each */
+    mc_hc_w16( src, i_src, buf_hc, 16, i_height );
+    mc_hv_w16( src, i_src, buf_hv, 16, i_height );
+    pixel_avg_w16( dst, i_dst, buf_hc, 16, buf_hv, 16, i_height );
+}
+
+/* pf_mc entry [dqy=2][dqx=3], width 4.
+ * pixel_avg_w4() of mc_hc_w4( src ) and mc_hv_w4() taken one column to the right (src + 1). */
+static void mc_xy32_w4( uint8_t *src, int i_src, uint8_t *dst, int i_dst, int i_height )
+{
+    uint8_t buf_hc[16*4], buf_hv[16*4];    /* stride-4 scratch, 16 rows each */
+    mc_hc_w4( src,     i_src, buf_hc, 4, i_height );
+    mc_hv_w4( src + 1, i_src, buf_hv, 4, i_height );
+    pixel_avg_w4( dst, i_dst, buf_hc, 4, buf_hv, 4, i_height );
+}
+/* pf_mc entry [dqy=2][dqx=3], width 8.
+ * pixel_avg_w8() of mc_hc_w8( src ) and mc_hv_w8() taken one column to the right (src + 1). */
+static void mc_xy32_w8( uint8_t *src, int i_src, uint8_t *dst, int i_dst, int i_height )
+{
+    uint8_t buf_hc[16*8], buf_hv[16*8];    /* stride-8 scratch, 16 rows each */
+    mc_hc_w8( src,     i_src, buf_hc, 8, i_height );
+    mc_hv_w8( src + 1, i_src, buf_hv, 8, i_height );
+    pixel_avg_w8( dst, i_dst, buf_hc, 8, buf_hv, 8, i_height );
+}
+/* pf_mc entry [dqy=2][dqx=3], width 16.
+ * pixel_avg_w16() of mc_hc_w16( src ) and mc_hv_w16() taken one column to the right (src + 1). */
+static void mc_xy32_w16( uint8_t *src, int i_src, uint8_t *dst, int i_dst, int i_height )
+{
+    uint8_t buf_hc[16*16], buf_hv[16*16];    /* stride-16 scratch, 16 rows each */
+    mc_hc_w16( src,     i_src, buf_hc, 16, i_height );
+    mc_hv_w16( src + 1, i_src, buf_hv, 16, i_height );
+    pixel_avg_w16( dst, i_dst, buf_hc, 16, buf_hv, 16, i_height );
+}
+
+/* pf_mc entry [dqy=3][dqx=2], width 4.
+ * pixel_avg_w4() of mc_hc_w4( src ) and mc_hh_w4() taken one row down (src + i_src). */
+static void mc_xy23_w4( uint8_t *src, int i_src, uint8_t *dst, int i_dst, int i_height )
+{
+    uint8_t buf_hc[16*4], buf_hh[16*4];    /* stride-4 scratch, 16 rows each */
+    mc_hc_w4( src,         i_src, buf_hc, 4, i_height );
+    mc_hh_w4( src + i_src, i_src, buf_hh, 4, i_height );
+    pixel_avg_w4( dst, i_dst, buf_hc, 4, buf_hh, 4, i_height );
+}
+/* pf_mc entry [dqy=3][dqx=2], width 8.
+ * pixel_avg_w8() of mc_hc_w8( src ) and mc_hh_w8() taken one row down (src + i_src). */
+static void mc_xy23_w8( uint8_t *src, int i_src, uint8_t *dst, int i_dst, int i_height )
+{
+    uint8_t buf_hc[16*8], buf_hh[16*8];    /* stride-8 scratch, 16 rows each */
+    mc_hc_w8( src,         i_src, buf_hc, 8, i_height );
+    mc_hh_w8( src + i_src, i_src, buf_hh, 8, i_height );
+    pixel_avg_w8( dst, i_dst, buf_hc, 8, buf_hh, 8, i_height );
+}
+/* pf_mc entry [dqy=3][dqx=2], width 16.
+ * pixel_avg_w16() of mc_hc_w16( src ) and mc_hh_w16() taken one row down (src + i_src). */
+static void mc_xy23_w16( uint8_t *src, int i_src, uint8_t *dst, int i_dst, int i_height )
+{
+    uint8_t buf_hc[16*16], buf_hh[16*16];    /* stride-16 scratch, 16 rows each */
+    mc_hc_w16( src,         i_src, buf_hc, 16, i_height );
+    mc_hh_w16( src + i_src, i_src, buf_hh, 16, i_height );
+    pixel_avg_w16( dst, i_dst, buf_hc, 16, buf_hh, 16, i_height );
+}
+
+/* Luma motion-compensation dispatcher. mvx/mvy are split into an integer part
+ * (>> 2, applied to src as a column/row offset) and a 2-bit fraction (& 0x03) that
+ * selects the routine. Widths other than 4, 8 and 16 return without calling anything. */
+static void motion_compensation_luma( uint8_t *src, int i_src,
+                                      uint8_t *dst, int i_dst,
+                                      int mvx,int mvy,
+                                      int i_width, int i_height )
+{
+    static const pf_mc_t pf_mc[3][4][4] =    /*XXX [dqy][dqx] */
+    {
+        {
+            { mc_copy_w4,  mc_xy10_w4,    mc_hh_w4,      mc_xy30_w4 },
+            { mc_xy01_w4,  mc_xy11_w4,    mc_xy21_w4,    mc_xy31_w4 },
+            { mc_hv_w4,    mc_xy12_w4,    mc_hc_w4,      mc_xy32_w4 },
+            { mc_xy03_w4,  mc_xy13_w4,    mc_xy23_w4,    mc_xy33_w4 },
+        },
+        {
+            { mc_copy_w8,  mc_xy10_w8,    mc_hh_w8,      mc_xy30_w8 },
+            { mc_xy01_w8,  mc_xy11_w8,    mc_xy21_w8,    mc_xy31_w8 },
+            { mc_hv_w8,    mc_xy12_w8,    mc_hc_w8,      mc_xy32_w8 },
+            { mc_xy03_w8,  mc_xy13_w8,    mc_xy23_w8,    mc_xy33_w8 },
+        },
+        {
+            { mc_copy_w16,  mc_xy10_w16,    mc_hh_w16,      mc_xy30_w16 },
+            { mc_xy01_w16,  mc_xy11_w16,    mc_xy21_w16,    mc_xy31_w16 },
+            { mc_hv_w16,    mc_xy12_w16,    mc_hc_w16,      mc_xy32_w16 },
+            { mc_xy03_w16,  mc_xy13_w16,    mc_xy23_w16,    mc_xy33_w16 },
+        }
+    };
+    const int dqx = mvx & 0x03, dqy = mvy & 0x03;
+    int i_size;    /* first index of pf_mc: 0 = width 4, 1 = width 8, 2 = width 16 */
+    switch( i_width )
+    {
+        case 4:  i_size = 0; break;
+        case 8:  i_size = 1; break;
+        case 16: i_size = 2; break;
+        default: return;
+    }
+    src += (mvy >> 2) * i_src + (mvx >> 2);
+    pf_mc[i_size][dqy][dqx]( src, i_src, dst, i_dst, i_height );
+}
+
+void x264_mc_altivec_init( x264_mc_function_t pf[2] )
+{
+    pf[MC_LUMA] = &motion_compensation_luma;    /* the only slot of pf written here */
+}
diff --git a/core/ppc/mc.h b/core/ppc/mc.h
new file mode 100644
index 00000000..cf006f2b
--- /dev/null
+++ b/core/ppc/mc.h
@@ -0,0 +1,29 @@
+/*****************************************************************************
+ * mc.h: h264 encoder library
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: mc.h,v 1.1 2004/06/03 19:27:07 fenrir Exp $
+ *
+ * Authors: Eric Petit <titer@m0k.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#ifndef _PPC_MC_H
+#define _PPC_MC_H 1
+/* Sets pf[MC_LUMA]; x264_mc_function_t must be declared before this header is included. */
+void x264_mc_altivec_init( x264_mc_function_t pf[2] );
+
+#endif /* _PPC_MC_H */
diff --git a/core/ppc/pixel.c b/core/ppc/pixel.c
new file mode 100644
index 00000000..16cbcc5d
--- /dev/null
+++ b/core/ppc/pixel.c
@@ -0,0 +1,215 @@
+/*****************************************************************************
+ * pixel.c: h264 encoder
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: pixel.c,v 1.1 2004/06/03 19:27:07 fenrir Exp $
+ *
+ * Authors: Eric Petit <titer@m0k.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+
+#include "x264.h"
+#include "../pixel.h"
+#include "pixel.h"
+#include "ppccommon.h"
+
+/* sad routines: PIXEL_SAD_ALTIVEC( name, lx, ly, a, b ) defines `name`, the sum of
+ * absolute differences of pix1 and pix2 over ly rows, each row read with LOAD_<lx>.
+ * vec_sum<a> reduces the per-word partial sums; word b of that result is returned. */
+#define PIXEL_SAD_ALTIVEC( name, lx, ly, a, b )                            \
+static int name( uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )    \
+{                                                                          \
+    DECLARE_ALIGNED( int, i_sad, 16 );                                     \
+    int rows_left;                                                         \
+    LOAD_ZERO;                                                             \
+    vector_u8_t  rowv1, rowv2, absdiffv;                                   \
+    vector_s32_t accv = zero_s32;                                          \
+    for( rows_left = ly; rows_left > 0; rows_left-- )                      \
+    {                                                                      \
+        LOAD_##lx( pix1, rowv1 );                                          \
+        LOAD_##lx( pix2, rowv2 );                                          \
+        absdiffv = vec_sub( vec_max( rowv1, rowv2 ),                       \
+                            vec_min( rowv1, rowv2 ) );                     \
+        accv = (vector_s32_t) vec_sum4s( absdiffv,                         \
+                                         (vector_u32_t) accv );            \
+        pix1 += i_pix1;                                                    \
+        pix2 += i_pix2;                                                    \
+    }                                                                      \
+    accv = vec_sum##a( accv, zero_s32 );                                   \
+    vec_ste( vec_splat( accv, b ), 0, &i_sad );                            \
+    return i_sad;                                                          \
+}
+
+PIXEL_SAD_ALTIVEC( pixel_sad_16x16_altivec, 16, 16, s,  3 )
+PIXEL_SAD_ALTIVEC( pixel_sad_8x16_altivec,  8,  16, 2s, 1 )
+PIXEL_SAD_ALTIVEC( pixel_sad_16x8_altivec,  16, 8,  s,  3 )
+PIXEL_SAD_ALTIVEC( pixel_sad_8x8_altivec,   8,  8,  2s, 1 )
+
+/* satd routines */
+/* SATD of one 8x8 block.
+ * HADAMAR is run on difference rows 0-3 and 4-7, the 8x8 result is transposed and
+ * HADAMAR is run again the same way; the return value is half the sum of the
+ * absolute values of the final coefficients.
+ * Relies on LOAD_8, HADAMAR and TRANSPOSE8x8 from ppccommon.h. */
+static inline int pixel_satd_8x8_altivec( uint8_t *pix1, int i_pix1,
+                                          uint8_t *pix2, int i_pix2 )
+{
+    DECLARE_ALIGNED( int, i_sum, 16 );
+    int row;
+
+    LOAD_ZERO;
+    vector_s32_t accv = zero_s32;
+    vector_u8_t  row1_u8v, row2_u8v;
+    vector_s16_t rowsv[8];
+    vector_s16_t workv[8];
+    vector_s16_t s01v, s23v, d01v, d23v;    /* names HADAMAR expects in scope */
+
+    /* rowsv[row] = pix1 row - pix2 row, after u8 -> s16 conversion */
+    for( row = 0; row < 8; row++, pix1 += i_pix1, pix2 += i_pix2 )
+    {
+        LOAD_8( pix1, row1_u8v );
+        LOAD_8( pix2, row2_u8v );
+        rowsv[row] = vec_sub( (vector_s16_t) vec_mergeh( zero_u8, row1_u8v ),
+                              (vector_s16_t) vec_mergeh( zero_u8, row2_u8v ) );
+    }
+
+    /* First Hadamard pass */
+    HADAMAR( &rowsv[0], &workv[0] );
+    HADAMAR( &rowsv[4], &workv[4] );
+
+    /* Transpose: result lands in rowsv, workv is used as scratch */
+    TRANSPOSE8x8( workv, rowsv );
+
+    /* Second Hadamard pass */
+    HADAMAR( &rowsv[0], &workv[0] );
+    HADAMAR( &rowsv[4], &workv[4] );
+
+    /* Add up the absolute values: four partial sums first ... */
+    for( row = 0; row < 8; row++ )
+    {
+        accv = vec_sum4s( vec_abs( workv[row] ), accv );
+    }
+
+    /* ... then vec_sums leaves the grand total in word 3 */
+    accv = vec_sums( accv, zero_s32 );
+
+    /* Move word 3 into the scalar and halve it */
+    vec_ste( vec_splat( accv, 3 ), 0, &i_sum );
+
+    return i_sum / 2;
+}
+
+/* 16x8 SATD: sum of the 8x8 SATDs of the left half and the right half (8 bytes further on). */
+static int pixel_satd_16x8_altivec( uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
+{
+    int i_satd;
+    i_satd  = pixel_satd_8x8_altivec( pix1,     i_pix1, pix2,     i_pix2 );
+    i_satd += pixel_satd_8x8_altivec( pix1 + 8, i_pix1, pix2 + 8, i_pix2 );
+    return i_satd;
+}
+/* 8x16 SATD: sum of the 8x8 SATDs of rows 0-7 and rows 8-15 (8 strides further on). */
+static int pixel_satd_8x16_altivec( uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
+{
+    int i_satd;
+    i_satd  = pixel_satd_8x8_altivec( pix1,              i_pix1, pix2,              i_pix2 );
+    i_satd += pixel_satd_8x8_altivec( pix1 + 8 * i_pix1, i_pix1, pix2 + 8 * i_pix2, i_pix2 );
+    return i_satd;
+}
+/* 16x16 SATD: sum of the 8x8 SATDs of the four 8x8 quadrants. */
+static int pixel_satd_16x16_altivec( uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
+{
+    uint8_t *low1 = pix1 + 8 * i_pix1;    /* start of rows 8-15 in each block */
+    uint8_t *low2 = pix2 + 8 * i_pix2;
+    int i_satd;
+    i_satd  = pixel_satd_8x8_altivec( pix1,     i_pix1, pix2,     i_pix2 );
+    i_satd += pixel_satd_8x8_altivec( pix1 + 8, i_pix1, pix2 + 8, i_pix2 );
+    i_satd += pixel_satd_8x8_altivec( low1,     i_pix1, low2,     i_pix2 );
+    i_satd += pixel_satd_8x8_altivec( low1 + 8, i_pix1, low2 + 8, i_pix2 );
+    return i_satd;
+}
+
+/* SATD of one 4x4 block.
+ * HADAMAR is run on the four difference rows, elements 0-3 of the result are
+ * transposed with TRANSPOSE4x4 and HADAMAR is run again; the return value is half
+ * the sum of the absolute values gathered in the first two accumulator words.
+ * Relies on LOAD_4, HADAMAR and TRANSPOSE4x4 from ppccommon.h. */
+static inline int pixel_satd_4x4_altivec( uint8_t *pix1, int i_pix1,
+                                          uint8_t *pix2, int i_pix2 )
+{
+    DECLARE_ALIGNED( int, i_sum, 16 );
+    int row;
+
+    LOAD_ZERO;
+    vector_s32_t accv = zero_s32;
+    vector_u8_t  row1_u8v, row2_u8v;
+    vector_s16_t rowsv[4];
+    vector_s16_t workv[4];
+    vector_s16_t s01v, s23v, d01v, d23v;    /* names HADAMAR expects in scope */
+
+    /* rowsv[row] = pix1 row - pix2 row, after u8 -> s16 conversion */
+    for( row = 0; row < 4; row++, pix1 += i_pix1, pix2 += i_pix2 )
+    {
+        LOAD_4( pix1, row1_u8v );
+        LOAD_4( pix2, row2_u8v );
+        rowsv[row] = vec_sub( (vector_s16_t) vec_mergeh( zero_u8, row1_u8v ),
+                              (vector_s16_t) vec_mergeh( zero_u8, row2_u8v ) );
+    }
+
+    /* First Hadamard pass */
+    HADAMAR( rowsv, workv );
+
+    /* Transpose: result lands in rowsv, workv is used as scratch */
+    TRANSPOSE4x4( workv, rowsv );
+
+    /* Second Hadamard pass */
+    HADAMAR( rowsv, workv );
+
+    /* Add up the absolute values: four partial sums first ... */
+    for( row = 0; row < 4; row++ )
+    {
+        accv = vec_sum4s( vec_abs( workv[row] ), accv );
+    }
+
+    /* ... then vec_sum2s leaves the sum of words 0 and 1 in word 1 */
+    accv = vec_sum2s( accv, zero_s32 );
+
+    /* Move word 1 into the scalar and halve it */
+    vec_ste( vec_splat( accv, 1 ), 0, &i_sum );
+
+    return i_sum / 2;
+}
+
+/****************************************************************************
+ * x264_pixel_altivec_init:
+ ****************************************************************************/
+void x264_pixel_altivec_init( x264_pixel_function_t *pixf )
+{
+    pixf->satd[PIXEL_4x4]   = &pixel_satd_4x4_altivec;
+    pixf->satd[PIXEL_8x8]   = &pixel_satd_8x8_altivec;
+    pixf->satd[PIXEL_16x8]  = &pixel_satd_16x8_altivec;
+    pixf->satd[PIXEL_8x16]  = &pixel_satd_8x16_altivec;
+    pixf->satd[PIXEL_16x16] = &pixel_satd_16x16_altivec;
+    /* sad[]: only these four sizes are replaced; no other entry of pixf is written here */
+    pixf->sad[PIXEL_8x8]    = &pixel_sad_8x8_altivec;
+    pixf->sad[PIXEL_16x8]   = &pixel_sad_16x8_altivec;
+    pixf->sad[PIXEL_8x16]   = &pixel_sad_8x16_altivec;
+    pixf->sad[PIXEL_16x16]  = &pixel_sad_16x16_altivec;
+}
diff --git a/core/ppc/pixel.h b/core/ppc/pixel.h
new file mode 100644
index 00000000..f2d6a181
--- /dev/null
+++ b/core/ppc/pixel.h
@@ -0,0 +1,29 @@
+/*****************************************************************************
+ * pixel.h: h264 encoder library
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: pixel.h,v 1.1 2004/06/03 19:27:07 fenrir Exp $
+ *
+ * Authors: Eric Petit <titer@m0k.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#ifndef _PPC_PIXEL_H
+#define _PPC_PIXEL_H 1
+/* Sets sad[] and satd[] entries of pixf; x264_pixel_function_t must be declared by the includer. */
+void x264_pixel_altivec_init( x264_pixel_function_t *pixf );
+
+#endif /* _PPC_PIXEL_H */
diff --git a/core/ppc/ppccommon.h b/core/ppc/ppccommon.h
new file mode 100644
index 00000000..15f22f65
--- /dev/null
+++ b/core/ppc/ppccommon.h
@@ -0,0 +1,158 @@
+/*****************************************************************************
+ * ppccommon.h: h264 encoder
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: ppccommon.h,v 1.1 2004/06/03 19:27:07 fenrir Exp $
+ *
+ * Authors: Eric Petit <titer@m0k.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+/* Handy shorthands for the AltiVec vector types used by the files including this header */
+#define vector_u8_t  vector unsigned char
+#define vector_s16_t vector signed short
+#define vector_u32_t vector unsigned int
+#define vector_s32_t vector signed int
+/* LOAD_ZERO declares a local `zero`; the zero_* views below only work where it is in scope */
+#define LOAD_ZERO    vector_s32_t zero = vec_splat_s32( 0 )
+#define zero_u8      (vector_u8_t)  zero
+#define zero_s16     (vector_s16_t) zero
+#define zero_s32     (vector_s32_t) zero
+/* Replace a by its first 8 bytes widened to 16 bits (a zero byte merged ahead of each one) */
+#define CONVERT_U8_TO_S16( a ) \
+    a = (vector_s16_t) vec_mergeh( zero_u8, (vector_u8_t) a )
+
+/* Macros to load aligned or unaligned data without risking buffer overflows.
+   LOAD_16( p, v ): v = the 16 bytes at p; a second vec_ld is issued only for misaligned p.
+   Expands to a bare if/else statement and evaluates p several times. */
+#define LOAD_16( p, v )                                                               \
+    if( (unsigned long) (p) & 0xF )                                                   \
+    {                                                                                 \
+        (v) = vec_perm( vec_ld( 0, (p) ), vec_ld( 16, (p) ), vec_lvsl( 0, (p) ) );    \
+    }                                                                                 \
+    else                                                                              \
+    {                                                                                 \
+        (v) = vec_ld( 0, (p) );                                                       \
+    }
+
+/* LOAD_8( p, v ): bytes 0-7 of v = the 8 bytes at p (bytes 8-15 are not meaningful). A second
+   vec_ld happens only if they cross a 16-byte boundary. Needs `zero` (LOAD_ZERO) in scope. */
+#define LOAD_8( p, v )                                                                          \
+    if( !( (unsigned long) (p) & 0xF ) )                                                        \
+    {                                                                                           \
+        (v) = vec_ld( 0, (p) );                                                                 \
+    }                                                                                           \
+    else if( ( (unsigned long) (p) & 0xF ) < 9 )                                                \
+    {                                                                                           \
+        (v) = vec_perm( vec_ld( 0, (p) ), (vector unsigned char) zero, vec_lvsl( 0, (p) ) );    \
+    }                                                                                           \
+    else                                                                                        \
+    {                                                                                           \
+        (v) = vec_perm( vec_ld( 0, (p) ), vec_ld( 16, (p) ), vec_lvsl( 0, (p) ) );              \
+    }
+
+/* LOAD_4( p, v ): bytes 0-3 of v = the 4 bytes at p (bytes 4-15 are not meaningful). A second
+   vec_ld happens only if they cross a 16-byte boundary. Needs `zero` (LOAD_ZERO) in scope. */
+#define LOAD_4( p, v )                                                                          \
+    if( !( (unsigned long) (p) & 0xF ) )                                                        \
+    {                                                                                           \
+        (v) = vec_ld( 0, (p) );                                                                 \
+    }                                                                                           \
+    else if( ( (unsigned long) (p) & 0xF ) < 13 )                                               \
+    {                                                                                           \
+        (v) = vec_perm( vec_ld( 0, (p) ), (vector unsigned char) zero, vec_lvsl( 0, (p) ) );    \
+    }                                                                                           \
+    else                                                                                        \
+    {                                                                                           \
+        (v) = vec_perm( vec_ld( 0, (p) ), vec_ld( 16, (p) ), vec_lvsl( 0, (p) ) );              \
+    }
+
+/* Store aligned or unaligned data. For misaligned p the two aligned blocks around it
+   are read, merged with v and written back; v itself is left rotated by the macro. */
+#define STORE_16( v, p )                              \
+    if( (unsigned long) (p) & 0xF )                   \
+    {                                                 \
+        vector unsigned char tmp1, tmp2;              \
+        vector unsigned char align, mask;             \
+        tmp1 = vec_ld( 0, (p) );                      \
+        tmp2 = vec_ld( 16, (p) );                     \
+        align = vec_lvsr( 0, (p) );                   \
+        mask = vec_perm( (vector unsigned char) (0),  \
+                         (vector unsigned char) (-1), align); \
+        (v) = vec_perm( (v), (v), align );            \
+        tmp1 = vec_sel( tmp1, (v), mask );            \
+        tmp2 = vec_sel( (v), tmp2, mask );            \
+        vec_st( tmp1, 0, (p) );                       \
+        vec_st( tmp2, 16, (p) );                      \
+    }                                                 \
+    else                                              \
+    {                                                 \
+        vec_st( (v), 0, (p) );                        \
+    }
+
+/* Transpose 8x8 (vector_s16_t [8]): the result is left in b, a is overwritten as scratch */
+#define TRANSPOSE8x8( a, b )               \
+    (b)[0] = vec_mergeh( (a)[0], (a)[4] ); /* pass 1: a -> b */ \
+    (b)[1] = vec_mergel( (a)[0], (a)[4] ); \
+    (b)[2] = vec_mergeh( (a)[1], (a)[5] ); \
+    (b)[3] = vec_mergel( (a)[1], (a)[5] ); \
+    (b)[4] = vec_mergeh( (a)[2], (a)[6] ); \
+    (b)[5] = vec_mergel( (a)[2], (a)[6] ); \
+    (b)[6] = vec_mergeh( (a)[3], (a)[7] ); \
+    (b)[7] = vec_mergel( (a)[3], (a)[7] ); \
+    (a)[0] = vec_mergeh( (b)[0], (b)[4] ); /* pass 2: b -> a */ \
+    (a)[1] = vec_mergel( (b)[0], (b)[4] ); \
+    (a)[2] = vec_mergeh( (b)[1], (b)[5] ); \
+    (a)[3] = vec_mergel( (b)[1], (b)[5] ); \
+    (a)[4] = vec_mergeh( (b)[2], (b)[6] ); \
+    (a)[5] = vec_mergel( (b)[2], (b)[6] ); \
+    (a)[6] = vec_mergeh( (b)[3], (b)[7] ); \
+    (a)[7] = vec_mergel( (b)[3], (b)[7] ); \
+    (b)[0] = vec_mergeh( (a)[0], (a)[4] ); /* pass 3: a -> b */ \
+    (b)[1] = vec_mergel( (a)[0], (a)[4] ); \
+    (b)[2] = vec_mergeh( (a)[1], (a)[5] ); \
+    (b)[3] = vec_mergel( (a)[1], (a)[5] ); \
+    (b)[4] = vec_mergeh( (a)[2], (a)[6] ); \
+    (b)[5] = vec_mergel( (a)[2], (a)[6] ); \
+    (b)[6] = vec_mergeh( (a)[3], (a)[7] ); \
+    (b)[7] = vec_mergel( (a)[3], (a)[7] );
+
+/* Transpose 4x4 (vector_s16_t [4]): elements 0-3 of a[] -> b[]; clobbers a, needs `zero` in scope */
+#define TRANSPOSE4x4( a, b ) \
+    (b)[0] = vec_mergeh( (a)[0], zero_s16 ); \
+    (b)[1] = vec_mergeh( (a)[1], zero_s16 ); \
+    (b)[2] = vec_mergeh( (a)[2], zero_s16 ); \
+    (b)[3] = vec_mergeh( (a)[3], zero_s16 ); \
+    (a)[0] = vec_mergeh( (b)[0], (b)[2] );   \
+    (a)[1] = vec_mergel( (b)[0], (b)[2] );   \
+    (a)[2] = vec_mergeh( (b)[1], (b)[3] );   \
+    (a)[3] = vec_mergel( (b)[1], (b)[3] );   \
+    (b)[0] = vec_mergeh( (a)[0], (a)[2] );   \
+    (b)[1] = vec_mergel( (a)[0], (a)[2] );   \
+    (b)[2] = vec_mergeh( (a)[1], (a)[3] );   \
+    (b)[3] = vec_mergel( (a)[1], (a)[3] );
+
+/* Hadamard, 4 points (vector_s16_t [4]): (a)[0..3] -> (b)[0..3]; caller declares s01v, s23v, d01v, d23v */
+#define HADAMAR( a, b ) \
+    s01v   = vec_add( (a)[0], (a)[1] ); \
+    s23v   = vec_add( (a)[2], (a)[3] ); \
+    d01v   = vec_sub( (a)[0], (a)[1] ); \
+    d23v   = vec_sub( (a)[2], (a)[3] ); \
+    (b)[0] = vec_add( s01v, s23v );     /* a0 + a1 + a2 + a3 */ \
+    (b)[1] = vec_sub( s01v, s23v );     /* a0 + a1 - a2 - a3 */ \
+    (b)[2] = vec_sub( d01v, d23v );     /* a0 - a1 - a2 + a3 */ \
+    (b)[3] = vec_add( d01v, d23v );     /* a0 - a1 + a2 - a3 */
+
diff --git a/core/predict.c b/core/predict.c
new file mode 100644
index 00000000..6a799a09
--- /dev/null
+++ b/core/predict.c
@@ -0,0 +1,697 @@
+/*****************************************************************************
+ * predict.c: h264 encoder
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: predict.c,v 1.1 2004/06/03 19:27:07 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+/* XXX: the predict_4x4_* functions are inspired by the ffmpeg h264 decoder.
+ */
+
+#include <stdlib.h>
+#include <stdint.h>
+
+#include "x264.h"
+#include "predict.h"
+
+#ifdef _MSC_VER
+#undef HAVE_MMXEXT  /* not finished now */
+#endif
+#ifdef HAVE_MMXEXT
+#   include "i386/predict.h"
+#endif
+
+static inline int clip_uint8( int a )  /* clamp a to the 8-bit pixel range: returns 0 for a < 0, 255 for a > 255, else a */
+{
+    if (a&(~255))   /* some bit outside the low 8 is set: a is negative or above 255 */
+        return a < 0 ? 0 : 255;  /* explicit clamp: no arithmetic right shift of a negative int, no -a overflow for INT_MIN */
+    else
+        return a;
+}
+
+/****************************************************************************
+ * 16x16 prediction for intra block DC, H, V, P
+ ****************************************************************************/
+static void predict_16x16_dc( uint8_t *src, int i_stride )
+{
+    /* DC mode: flood the 16x16 block with the rounded mean of the 16
+     * pixels in the column left of it and the 16 in the row above it. */
+    const uint8_t *above = src - i_stride;
+    const uint8_t *left  = src - 1;
+    int sum = 16;   /* rounding term for the division by 32 */
+    int x, y;
+
+    for( y = 0; y < 16; y++ )
+    {
+        sum += left[y * i_stride] + above[y];
+    }
+    sum >>= 5;
+    for( y = 0; y < 16; y++, src += i_stride )
+    {
+        for( x = 0; x < 16; x++ )
+        {
+            src[x] = sum;
+        }
+    }
+}
+static void predict_16x16_dc_left( uint8_t *src, int i_stride )
+{
+    /* DC from the left column only: mean of the 16 pixels at x = -1. */
+    const uint8_t *left = src - 1;
+    int sum = 8;    /* rounding term for the division by 16 */
+    int x, y;
+
+    for( y = 0; y < 16; y++ )
+    {
+        sum += left[y * i_stride];
+    }
+    sum >>= 4;
+    for( y = 0; y < 16; y++, src += i_stride )
+    {
+        for( x = 0; x < 16; x++ )
+        {
+            src[x] = sum;
+        }
+    }
+}
+static void predict_16x16_dc_top( uint8_t *src, int i_stride )
+{
+    /* DC from the top row only: mean of the 16 pixels at y = -1. */
+    const uint8_t *above = src - i_stride;
+    int sum = 8;    /* rounding term for the division by 16 */
+    int x, y;
+
+    for( x = 0; x < 16; x++ )
+    {
+        sum += above[x];
+    }
+    sum >>= 4;
+    for( y = 0; y < 16; y++, src += i_stride )
+    {
+        for( x = 0; x < 16; x++ )
+        {
+            src[x] = sum;
+        }
+    }
+}
+static void predict_16x16_dc_128( uint8_t *src, int i_stride )
+{
+    /* No neighbours are read: every pixel of the 16x16 block becomes 128. */
+    int row;
+    for( row = 0; row < 16; row++, src += i_stride )
+    {
+        int col;
+        for( col = 0; col < 16; col++ )
+        {
+            src[col] = 128;
+        }
+    }
+}
+static void predict_16x16_h( uint8_t *src, int i_stride )
+{
+    /* Horizontal mode: each of the 16 rows repeats the pixel found
+     * immediately to its left (x = -1). */
+    uint8_t *row = src;
+    int x, y;
+
+    for( y = 0; y < 16; y++, row += i_stride )
+    {
+        const uint8_t left = row[-1];
+
+        for( x = 0; x < 16; x++ )
+        {
+            row[x] = left;
+        }
+    }
+}
+static void predict_16x16_v( uint8_t *src, int i_stride )
+{
+    const uint8_t *above = src - i_stride;  /* row just above the block */
+    int x, y;
+
+    /* Vertical mode: every one of the 16 rows is a copy of that row. */
+    for( y = 0; y < 16; y++, src += i_stride )
+    {
+        for( x = 0; x < 16; x++ )
+            src[x] = above[x];
+    }
+}
+static void predict_16x16_p( uint8_t *src, int i_stride )
+{
+    /* Plane mode: fit a linear ramp to the top row and left column, then
+     * evaluate it at every pixel of the 16x16 block and clip to 8 bits. */
+    const uint8_t *above = src - i_stride;
+    const uint8_t *left  = src - 1;
+    int grad_h = 0;     /* weighted horizontal gradient (H) */
+    int grad_v = 0;     /* weighted vertical gradient (V) */
+    int base, step_x, step_y, row_start;
+    int k;
+    int x, y;
+
+    /* weights 1..8 on pixel pairs mirrored around position 7; the pair
+     * for k == 8 reaches index -1, i.e. the top-left corner pixel */
+    for( k = 1; k <= 8; k++ )
+    {
+        grad_h += k * ( above[7 + k] - above[7 - k] );
+        grad_v += k * ( left[(7 + k) * i_stride] - left[(7 - k) * i_stride] );
+    }
+
+    base   = 16 * ( left[15 * i_stride] + above[15] );
+    step_x = ( 5 * grad_h + 32 ) >> 6;
+    step_y = ( 5 * grad_v + 32 ) >> 6;
+
+    /* ramp value (scaled by 32, rounding term included) at pixel (0,0) */
+    row_start = base - step_x * 7 - step_y * 7 + 16;
+
+    for( y = 0; y < 16; y++, src += i_stride, row_start += step_y )
+    {
+        for( x = 0; x < 16; x++ )
+        {
+            src[x] = clip_uint8( ( row_start + step_x * x ) >> 5 );
+        }
+    }
+}
+
+
+/****************************************************************************
+ * 8x8 prediction for intra chroma block DC, H, V, P
+ ****************************************************************************/
+static void predict_8x8_dc_128( uint8_t *src, int i_stride )
+{
+    /* No neighbours are read: every pixel of the 8x8 block becomes 128. */
+    int row;
+    for( row = 0; row < 8; row++, src += i_stride )
+    {
+        int col;
+        for( col = 0; col < 8; col++ )
+        {
+            src[col] = 128;
+        }
+    }
+}
+static void predict_8x8_dc_left( uint8_t *src, int i_stride )
+{
+    /* Left-only DC: rows 0-3 and rows 4-7 each use the mean of their own four left neighbours. */
+    const uint8_t *left = src - 1;
+    uint8_t *lower = src + 4 * i_stride;
+    int upper_dc = 2, lower_dc = 2;     /* start at the rounding term */
+    int x, y;
+    for( y = 0; y < 4; y++ )
+    {
+        upper_dc += left[y * i_stride];
+        lower_dc += left[(y + 4) * i_stride];
+    }
+    upper_dc >>= 2;
+    lower_dc >>= 2;
+    for( y = 0; y < 4; y++, src += i_stride, lower += i_stride )
+    {
+        for( x = 0; x < 8; x++ )
+        {
+            src[x]   = upper_dc;
+            lower[x] = lower_dc;
+        }
+    }
+}
+static void predict_8x8_dc_top( uint8_t *src, int i_stride )
+{
+    /* Top-only DC: columns 0-3 and columns 4-7 each use the mean of their own four top neighbours. */
+    const uint8_t *above = src - i_stride;
+    int left_dc = 2, right_dc = 2;      /* start at the rounding term */
+    int x, y;
+
+    for( x = 0; x < 4; x++ )
+    {
+        left_dc  += above[x];
+        right_dc += above[x + 4];
+    }
+    left_dc  >>= 2;
+    right_dc >>= 2;
+    for( y = 0; y < 8; y++, src += i_stride )
+    {
+        for( x = 0; x < 4; x++ )
+        {
+            src[x]     = left_dc;
+            src[x + 4] = right_dc;
+        }
+    }
+}
+static void predict_8x8_dc( uint8_t *src, int i_stride )
+{
+    /* DC with both neighbours: the 8x8 block is handled as four 4x4
+     * quadrants, each filled with its own average.
+     *
+     *            top_l  top_r      <- sums of 4 pixels from the row above
+     *   left_u [  q00    q01  ]
+     *   left_d [  q10    q11  ]
+     *      ^ sums of 4 pixels from the column to the left
+     */
+    const uint8_t *above = src - i_stride;
+    const uint8_t *left  = src - 1;
+    uint8_t *lower = src + 4 * i_stride;
+    int top_l = 0, top_r = 0, left_u = 0, left_d = 0;
+    int q00, q01, q10, q11;
+    int k, x, y;
+
+    for( k = 0; k < 4; k++ )
+    {
+        top_l  += above[k];
+        top_r  += above[k + 4];
+        left_u += left[k * i_stride];
+        left_d += left[(k + 4) * i_stride];
+    }
+    q00 = ( top_l + left_u + 4 ) >> 3;  /* mean of 8: top-left sums */
+    q01 = ( top_r + 2 ) >> 2;           /* mean of 4: top sum only */
+    q10 = ( left_d + 2 ) >> 2;          /* mean of 4: left sum only */
+    q11 = ( top_r + left_d + 4 ) >> 3;  /* mean of 8: right-top and lower-left sums */
+
+    for( y = 0; y < 4; y++, src += i_stride, lower += i_stride )
+    {
+        for( x = 0; x < 4; x++ )
+        {
+            src[x]       = q00;
+            src[x + 4]   = q01;
+            lower[x]     = q10;
+            lower[x + 4] = q11;
+        }
+    }
+}
+
+static void predict_8x8_h( uint8_t *src, int i_stride )
+{
+    /* Horizontal mode: each of the 8 rows repeats the pixel found
+     * immediately to its left (x = -1). */
+    uint8_t *row = src;
+    int x, y;
+
+    for( y = 0; y < 8; y++, row += i_stride )
+    {
+        const uint8_t left = row[-1];
+
+        for( x = 0; x < 8; x++ )
+        {
+            row[x] = left;
+        }
+    }
+}
+static void predict_8x8_v( uint8_t *src, int i_stride )
+{
+    const uint8_t *above = src - i_stride;  /* row just above the block */
+    int x, y;
+
+    /* Vertical mode: every one of the 8 rows is a copy of that row. */
+    for( y = 0; y < 8; y++, src += i_stride )
+    {
+        for( x = 0; x < 8; x++ )
+            src[x] = above[x];
+    }
+}
+
+static void predict_8x8_p( uint8_t *src, int i_stride )
+{
+    /* Plane mode for the 8x8 block: fit a linear ramp to the top row and
+     * left column, evaluate it at every pixel and clip to 8 bits. */
+    const uint8_t *above = src - i_stride;
+    const uint8_t *left  = src - 1;
+    int grad_h = 0;     /* weighted horizontal gradient (H) */
+    int grad_v = 0;     /* weighted vertical gradient (V) */
+    int base, step_x, step_y, row_start;
+    int k, x, y;
+
+    /* weights 1..4 on pixel pairs mirrored around position 3; the pair
+     * for k == 4 reaches index -1, i.e. the top-left corner pixel */
+    for( k = 1; k <= 4; k++ )
+    {
+        grad_h += k * ( above[3 + k] - above[3 - k] );
+        grad_v += k * ( left[(3 + k) * i_stride] - left[(3 - k) * i_stride] );
+    }
+
+    base   = 16 * ( left[7 * i_stride] + above[7] );
+    step_x = ( 17 * grad_h + 16 ) >> 5;
+    step_y = ( 17 * grad_v + 16 ) >> 5;
+    /* ramp value (scaled by 32, rounding term included) at pixel (0,0) */
+    row_start = base - 3 * step_x - 3 * step_y + 16;
+
+    for( y = 0; y < 8; y++, src += i_stride, row_start += step_y )
+    {
+        for( x = 0; x < 8; x++ )
+        {
+            src[x] = clip_uint8( ( row_start + step_x * x ) >> 5 );
+        }
+    }
+}
+
+/****************************************************************************
+ * 4x4 prediction for intra luma block: DC, H, V, DDL, DDR, VR, HD, VL, HU
+ ****************************************************************************/
+static void predict_4x4_dc_128( uint8_t *src, int i_stride )
+{
+    /* No neighbours are read: every pixel of the 4x4 block becomes 128. */
+    int row, col;
+    for( row = 0; row < 4; row++, src += i_stride )
+    {
+        for( col = 0; col < 4; col++ )
+        {
+            src[col] = 128;
+        }
+    }
+}
+static void predict_4x4_dc_left( uint8_t *src, int i_stride )
+{
+    /* Left-only DC: rounded mean of the four pixels at x = -1. */
+    const uint8_t *left = src - 1;
+    int sum = 2;    /* rounding term for the division by 4 */
+    int x, y;
+    for( y = 0; y < 4; y++ )
+        sum += left[y * i_stride];
+    sum >>= 2;
+    for( y = 0; y < 4; y++, src += i_stride )
+    {
+        for( x = 0; x < 4; x++ )
+            src[x] = sum;
+    }
+}
+static void predict_4x4_dc_top( uint8_t *src, int i_stride )
+{
+    /* Top-only DC: rounded mean of the four pixels at y = -1. */
+    const uint8_t *above = src - i_stride;
+    int sum = 2;    /* rounding term for the division by 4 */
+    int x, y;
+    for( x = 0; x < 4; x++ )
+        sum += above[x];
+    sum >>= 2;
+    for( y = 0; y < 4; y++, src += i_stride )
+    {
+        for( x = 0; x < 4; x++ )
+            src[x] = sum;
+    }
+}
+static void predict_4x4_dc( uint8_t *src, int i_stride )
+{
+    /* DC mode: rounded mean of the four left and the four top pixels. */
+    const uint8_t *above = src - i_stride;
+    const uint8_t *left  = src - 1;
+    int sum = 4;    /* rounding term for the division by 8 */
+    int x, y;
+
+    for( x = 0; x < 4; x++ )
+        sum += left[x * i_stride] + above[x];
+    sum >>= 3;
+    for( y = 0; y < 4; y++, src += i_stride )
+    {
+        for( x = 0; x < 4; x++ )
+            src[x] = sum;
+    }
+}
+static void predict_4x4_h( uint8_t *src, int i_stride )
+{
+    /* Horizontal mode: each of the 4 rows repeats the pixel found
+     * immediately to its left (x = -1). */
+    uint8_t *row = src;
+    int x, y;
+
+    for( y = 0; y < 4; y++, row += i_stride )
+    {
+        const uint8_t left = row[-1];
+
+        for( x = 0; x < 4; x++ )
+        {
+            row[x] = left;
+        }
+    }
+}
+static void predict_4x4_v( uint8_t *src, int i_stride )
+{
+    const uint8_t *above = src - i_stride;  /* row just above the block */
+    int x, y;
+
+    /* Vertical mode: every one of the 4 rows is a copy of that row. */
+    for( y = 0; y < 4; y++, src += i_stride )
+    {
+        for( x = 0; x < 4; x++ )
+            src[x] = above[x];
+    }
+}
+
+#define PREDICT_4x4_LOAD_LEFT /* declares l0..l3: the pixels at x = -1 in rows 0..3; expects src and i_stride in scope */ \
+    const int l0 = src[-1+0*i_stride];   \
+    const int l1 = src[-1+1*i_stride];   \
+    const int l2 = src[-1+2*i_stride];   \
+    const int l3 = src[-1+3*i_stride];
+
+#define PREDICT_4x4_LOAD_TOP /* declares t0..t3: the pixels at y = -1 in columns 0..3 */ \
+    const int t0 = src[0-1*i_stride];   \
+    const int t1 = src[1-1*i_stride];   \
+    const int t2 = src[2-1*i_stride];   \
+    const int t3 = src[3-1*i_stride];
+
+#define PREDICT_4x4_LOAD_TOP_RIGHT /* declares t4..t7: the pixels at y = -1 in columns 4..7, i.e. right of the block's own columns */ \
+    const int t4 = src[4-1*i_stride];   \
+    const int t5 = src[5-1*i_stride];   \
+    const int t6 = src[6-1*i_stride];   \
+    const int t7 = src[7-1*i_stride];
+
+
+static void predict_4x4_ddl( uint8_t *src, int i_stride )
+{   /* 4x4 predictor from the 8 pixels above and above-right (t0..t7); indexing is src[y*i_stride + x] */
+    PREDICT_4x4_LOAD_TOP
+    PREDICT_4x4_LOAD_TOP_RIGHT
+    /* every anti-diagonal (x + y constant) shares one (1,2,1)/4 filtered value; x + y == 0: */
+    src[0*i_stride+0] = ( t0 + 2*t1+ t2 + 2 ) >> 2;
+    /* x + y == 1 */
+    src[0*i_stride+1] =
+    src[1*i_stride+0] = ( t1 + 2*t2+ t3 + 2 ) >> 2;
+    /* x + y == 2 */
+    src[0*i_stride+2] =
+    src[1*i_stride+1] =
+    src[2*i_stride+0] = ( t2 + 2*t3+ t4 + 2 ) >> 2;
+    /* x + y == 3 */
+    src[0*i_stride+3] =
+    src[1*i_stride+2] =
+    src[2*i_stride+1] =
+    src[3*i_stride+0] = ( t3 + 2*t4+ t5 + 2 ) >> 2;
+    /* x + y == 4 */
+    src[1*i_stride+3] =
+    src[2*i_stride+2] =
+    src[3*i_stride+1] = ( t4 + 2*t5+ t6 + 2 ) >> 2;
+    /* x + y == 5 */
+    src[2*i_stride+3] =
+    src[3*i_stride+2] = ( t5 + 2*t6+ t7 + 2 ) >> 2;
+    /* x + y == 6: there is no t8, so t7 takes its place and is weighted 3 in total */
+    src[3*i_stride+3] = ( t6 + 3 * t7 + 2 ) >> 2;
+}
+static void predict_4x4_ddr( uint8_t *src, int i_stride )
+{   /* 4x4 predictor from the left column, the top-left corner and the top row; each diagonal (x - y constant) shares one (1,2,1)/4 filtered value */
+    const int lt = src[-1-i_stride];
+    PREDICT_4x4_LOAD_LEFT
+    PREDICT_4x4_LOAD_TOP
+    /* main diagonal (x == y): centred on the corner pixel lt */
+    src[0*i_stride+0] =
+    src[1*i_stride+1] =
+    src[2*i_stride+2] =
+    src[3*i_stride+3] = ( t0 + 2*lt +l0 + 2 ) >> 2;
+    /* x - y == 1 */
+    src[0*i_stride+1] =
+    src[1*i_stride+2] =
+    src[2*i_stride+3] = ( lt + 2 * t0 + t1 + 2 ) >> 2;
+    /* x - y == 2 */
+    src[0*i_stride+2] =
+    src[1*i_stride+3] = ( t0 + 2 * t1 + t2 + 2 ) >> 2;
+    /* x - y == 3 */
+    src[0*i_stride+3] = ( t1 + 2 * t2 + t3 + 2 ) >> 2;
+    /* y - x == 1: from here on the values come from the left column */
+    src[1*i_stride+0] =
+    src[2*i_stride+1] =
+    src[3*i_stride+2] = ( lt + 2 * l0 + l1 + 2 ) >> 2;
+    /* y - x == 2 */
+    src[2*i_stride+0] =
+    src[3*i_stride+1] = ( l0 + 2 * l1 + l2 + 2 ) >> 2;
+    /* y - x == 3 */
+    src[3*i_stride+0] = ( l1 + 2 * l2 + l3 + 2 ) >> 2;
+}
+
+static void predict_4x4_vr( uint8_t *src, int i_stride )
+{   /* 4x4 predictor from the top row, the top-left corner and the left column */
+    const int lt = src[-1-i_stride];
+    PREDICT_4x4_LOAD_LEFT
+    PREDICT_4x4_LOAD_TOP
+    /* l3 is declared by PREDICT_4x4_LOAD_LEFT but never used here, hence an unused-variable warning */
+    /* rows 0 and 2: 2-tap averages; columns 1..3 of row 2 repeat columns 0..2 of row 0 */
+    src[0*i_stride+0]=
+    src[2*i_stride+1]= ( lt + t0 + 1 ) >> 1;
+
+    src[0*i_stride+1]=
+    src[2*i_stride+2]= ( t0 + t1 + 1 ) >> 1;
+
+    src[0*i_stride+2]=
+    src[2*i_stride+3]= ( t1 + t2 + 1 ) >> 1;
+
+    src[0*i_stride+3]= ( t2 + t3 + 1 ) >> 1;
+    /* rows 1 and 3: (1,2,1)/4 filters; columns 1..3 of row 3 repeat columns 0..2 of row 1 */
+    src[1*i_stride+0]=
+    src[3*i_stride+1]= ( l0 + 2 * lt + t0 + 2 ) >> 2;
+
+    src[1*i_stride+1]=
+    src[3*i_stride+2]= ( lt + 2 * t0 + t1 + 2 ) >> 2;
+
+    src[1*i_stride+2]=
+    src[3*i_stride+3]= ( t0 + 2 * t1 + t2 + 2) >> 2;
+
+    src[1*i_stride+3]= ( t1 + 2 * t2 + t3 + 2 ) >> 2;
+    src[2*i_stride+0]= ( lt + 2 * l0 + l1 + 2 ) >> 2;   /* column 0 of row 2 comes from the left column */
+    src[3*i_stride+0]= ( l0 + 2 * l1 + l2 + 2 ) >> 2;   /* column 0 of row 3 likewise */
+}
+
+static void predict_4x4_hd( uint8_t *src, int i_stride )
+{   /* 4x4 predictor from the left column, the top-left corner and the top row */
+    const int lt= src[-1-1*i_stride];
+    PREDICT_4x4_LOAD_LEFT
+    PREDICT_4x4_LOAD_TOP
+    /* t3 is declared by PREDICT_4x4_LOAD_TOP but never used here, hence an unused-variable warning */
+    /* columns 2..3 of each row repeat columns 0..1 of the row above; columns 0..1 of rows 1..3 come from the left column */
+    src[0*i_stride+0]=
+    src[1*i_stride+2]= ( lt + l0 + 1 ) >> 1;
+    src[0*i_stride+1]=
+    src[1*i_stride+3]= ( l0 + 2 * lt + t0 + 2 ) >> 2;
+    src[0*i_stride+2]= ( lt + 2 * t0 + t1 + 2 ) >> 2;   /* only row 0 reads the top row beyond t0 */
+    src[0*i_stride+3]= ( t0 + 2 * t1 + t2 + 2 ) >> 2;
+    src[1*i_stride+0]=
+    src[2*i_stride+2]= ( l0 + l1 + 1 ) >> 1;
+    src[1*i_stride+1]=
+    src[2*i_stride+3]= ( lt + 2 * l0 + l1 + 2 ) >> 2;
+    src[2*i_stride+0]=
+    src[3*i_stride+2]= ( l1 + l2+ 1 ) >> 1;
+    src[2*i_stride+1]=
+    src[3*i_stride+3]= ( l0 + 2 * l1 + l2 + 2 ) >> 2;
+    src[3*i_stride+0]= ( l2 + l3 + 1 ) >> 1;
+    src[3*i_stride+1]= ( l1 + 2 * l2 + l3 + 2 ) >> 2;
+}
+
+static void predict_4x4_vl( uint8_t *src, int i_stride )
+{   /* 4x4 predictor from the pixels above and above-right only (t0..t6) */
+    PREDICT_4x4_LOAD_TOP
+    PREDICT_4x4_LOAD_TOP_RIGHT
+    /* t7 is declared by PREDICT_4x4_LOAD_TOP_RIGHT but never used here, hence an unused-variable warning */
+    /* rows 0 and 2: 2-tap averages; columns 0..2 of row 2 repeat columns 1..3 of row 0 */
+    src[0*i_stride+0]= ( t0 + t1 + 1 ) >> 1;
+    src[0*i_stride+1]=
+    src[2*i_stride+0]= ( t1 + t2 + 1 ) >> 1;
+    src[0*i_stride+2]=
+    src[2*i_stride+1]= ( t2 + t3 + 1 ) >> 1;
+    src[0*i_stride+3]=
+    src[2*i_stride+2]= ( t3 + t4+ 1 ) >> 1;
+    src[2*i_stride+3]= ( t4 + t5+ 1 ) >> 1;
+    src[1*i_stride+0]= ( t0 + 2 * t1 + t2 + 2 ) >> 2;   /* rows 1 and 3: (1,2,1)/4 filters; columns 0..2 of row 3 repeat columns 1..3 of row 1 */
+    src[1*i_stride+1]=
+    src[3*i_stride+0]= ( t1 + 2 * t2 + t3 + 2 ) >> 2;
+    src[1*i_stride+2]=
+    src[3*i_stride+1]= ( t2 + 2 * t3 + t4 + 2 ) >> 2;
+    src[1*i_stride+3]=
+    src[3*i_stride+2]= ( t3 + 2 * t4 + t5 + 2 ) >> 2;
+    src[3*i_stride+3]= ( t4 + 2 * t5 + t6 + 2 ) >> 2;
+}
+
+static void predict_4x4_hu( uint8_t *src, int i_stride )
+{   /* 4x4 predictor from the left column only (l0..l3); pixels with the same x + 2*y share a value */
+    PREDICT_4x4_LOAD_LEFT
+    /* x + 2*y == 0 and 1 */
+    src[0*i_stride+0]= ( l0 + l1 + 1 ) >> 1;
+    src[0*i_stride+1]= ( l0 + 2 * l1 + l2 + 2 ) >> 2;
+    /* x + 2*y == 2 */
+    src[0*i_stride+2]=
+    src[1*i_stride+0]= ( l1 + l2 + 1 ) >> 1;
+    /* x + 2*y == 3 */
+    src[0*i_stride+3]=
+    src[1*i_stride+1]= ( l1 + 2*l2 + l3 + 2 ) >> 2;
+    /* x + 2*y == 4 */
+    src[1*i_stride+2]=
+    src[2*i_stride+0]= ( l2 + l3 + 1 ) >> 1;
+    /* x + 2*y == 5: there is no l4, so l3 takes its place and is weighted 3 in total */
+    src[1*i_stride+3]=
+    src[2*i_stride+1]= ( l2 + 2 * l3 + l3 + 2 ) >> 2;
+    /* x + 2*y > 5: the column has run out, the remaining six pixels replicate l3 */
+    src[2*i_stride+3]=
+    src[3*i_stride+1]=
+    src[3*i_stride+0]=
+    src[2*i_stride+2]=
+    src[3*i_stride+2]=
+    src[3*i_stride+3]= l3;
+}
+
+/****************************************************************************
+ * Exported functions:
+ ****************************************************************************/
+void x264_predict_16x16_init( int cpu, x264_predict_t pf[7] )
+{
+    /* Install the C predictors (one per 16x16 mode); with HAVE_MMXEXT the MMXEXT init then runs if cpu has that flag. */
+    pf[I_PRED_16x16_DC]      = predict_16x16_dc;
+    pf[I_PRED_16x16_DC_LEFT] = predict_16x16_dc_left;
+    pf[I_PRED_16x16_DC_TOP]  = predict_16x16_dc_top;
+    pf[I_PRED_16x16_DC_128]  = predict_16x16_dc_128;
+    pf[I_PRED_16x16_V]       = predict_16x16_v;
+    pf[I_PRED_16x16_H]       = predict_16x16_h;
+    pf[I_PRED_16x16_P]       = predict_16x16_p;
+#ifdef HAVE_MMXEXT
+    if( ( cpu & X264_CPU_MMXEXT ) != 0 )
+    {
+        x264_predict_16x16_init_mmxext( pf );
+    }
+#endif
+}
+
+void x264_predict_8x8_init( int cpu, x264_predict_t pf[7] )
+{
+    /* Install the C predictors (one per chroma mode); with HAVE_MMXEXT the MMXEXT init then runs if cpu has that flag. */
+    pf[I_PRED_CHROMA_DC]      = predict_8x8_dc;
+    pf[I_PRED_CHROMA_DC_LEFT] = predict_8x8_dc_left;
+    pf[I_PRED_CHROMA_DC_TOP]  = predict_8x8_dc_top;
+    pf[I_PRED_CHROMA_DC_128]  = predict_8x8_dc_128;
+    pf[I_PRED_CHROMA_V]       = predict_8x8_v;
+    pf[I_PRED_CHROMA_H]       = predict_8x8_h;
+    pf[I_PRED_CHROMA_P]       = predict_8x8_p;
+#ifdef HAVE_MMXEXT
+    if( ( cpu & X264_CPU_MMXEXT ) != 0 )
+    {
+        x264_predict_8x8_init_mmxext( pf );
+    }
+#endif
+}
+
+void x264_predict_4x4_init( int cpu, x264_predict_t pf[12] )
+{
+    /* Install the C predictors (one per 4x4 mode); with HAVE_MMXEXT the MMXEXT init then runs if cpu has that flag. */
+    pf[I_PRED_4x4_DC]      = predict_4x4_dc;
+    pf[I_PRED_4x4_DC_LEFT] = predict_4x4_dc_left;
+    pf[I_PRED_4x4_DC_TOP]  = predict_4x4_dc_top;
+    pf[I_PRED_4x4_DC_128]  = predict_4x4_dc_128;
+    pf[I_PRED_4x4_V]       = predict_4x4_v;
+    pf[I_PRED_4x4_H]       = predict_4x4_h;
+    pf[I_PRED_4x4_DDL]     = predict_4x4_ddl;
+    pf[I_PRED_4x4_DDR]     = predict_4x4_ddr;
+    pf[I_PRED_4x4_VR]      = predict_4x4_vr;
+    pf[I_PRED_4x4_HD]      = predict_4x4_hd;
+    pf[I_PRED_4x4_VL]      = predict_4x4_vl;
+    pf[I_PRED_4x4_HU]      = predict_4x4_hu;
+#ifdef HAVE_MMXEXT
+    if( ( cpu & X264_CPU_MMXEXT ) != 0 )
+    {
+        x264_predict_4x4_init_mmxext( pf );
+    }
+#endif
+}
+
diff --git a/core/predict.h b/core/predict.h
new file mode 100644
index 00000000..988e57fb
--- /dev/null
+++ b/core/predict.h
@@ -0,0 +1,92 @@
+/*****************************************************************************
+ * predict.h: h264 encoder library
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: predict.h,v 1.1 2004/06/03 19:27:07 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#ifndef _PREDICT_H
+#define _PREDICT_H 1
+
+typedef void (*x264_predict_t)( uint8_t *src, int i_stride );  /* intra predictor: src is the top-left pixel of the block to fill, i_stride the distance between rows; neighbours are read at negative offsets from src. Uses uint8_t, so <stdint.h> must already be included. */
+
+enum intra_chroma_pred_e   /* indices into the 7-entry table filled by x264_predict_8x8_init(); note DC is 0 here but 2 in intra16x16_pred_e */
+{
+    I_PRED_CHROMA_DC = 0,
+    I_PRED_CHROMA_H  = 1,
+    I_PRED_CHROMA_V  = 2,
+    I_PRED_CHROMA_P  = 3,
+    /* DC variants that read only the left column, only the top row, or no neighbour at all (constant 128) */
+    I_PRED_CHROMA_DC_LEFT = 4,
+    I_PRED_CHROMA_DC_TOP  = 5,
+    I_PRED_CHROMA_DC_128  = 6
+};
+static const int x264_mb_pred_mode8x8_fix[7] = /* indexed by intra_chroma_pred_e: entries 0..3 map to themselves, the three DC variants (4..6) fold back to I_PRED_CHROMA_DC */
+{
+    I_PRED_CHROMA_DC, I_PRED_CHROMA_H, I_PRED_CHROMA_V, I_PRED_CHROMA_P,
+    I_PRED_CHROMA_DC, I_PRED_CHROMA_DC,I_PRED_CHROMA_DC
+};
+
+enum intra16x16_pred_e     /* indices into the 7-entry table filled by x264_predict_16x16_init() */
+{
+    I_PRED_16x16_V  = 0,
+    I_PRED_16x16_H  = 1,
+    I_PRED_16x16_DC = 2,
+    I_PRED_16x16_P  = 3,
+    /* DC variants that read only the left column, only the top row, or no neighbour at all (constant 128) */
+    I_PRED_16x16_DC_LEFT = 4,
+    I_PRED_16x16_DC_TOP  = 5,
+    I_PRED_16x16_DC_128  = 6,
+};
+static const int x264_mb_pred_mode16x16_fix[7] =   /* indexed by intra16x16_pred_e: entries 0..3 map to themselves, the three DC variants (4..6) fold back to I_PRED_16x16_DC */
+{
+    I_PRED_16x16_V, I_PRED_16x16_H, I_PRED_16x16_DC, I_PRED_16x16_P,
+    I_PRED_16x16_DC,I_PRED_16x16_DC,I_PRED_16x16_DC
+};
+
+enum intra4x4_pred_e   /* indices into the 12-entry table filled by x264_predict_4x4_init() */
+{
+    I_PRED_4x4_V  = 0,
+    I_PRED_4x4_H  = 1,
+    I_PRED_4x4_DC = 2,
+    I_PRED_4x4_DDL= 3,
+    I_PRED_4x4_DDR= 4,
+    I_PRED_4x4_VR = 5,
+    I_PRED_4x4_HD = 6,
+    I_PRED_4x4_VL = 7,
+    I_PRED_4x4_HU = 8,
+    /* DC variants that read only the left column, only the top row, or no neighbour at all (constant 128) */
+    I_PRED_4x4_DC_LEFT = 9,
+    I_PRED_4x4_DC_TOP  = 10,
+    I_PRED_4x4_DC_128  = 11,
+};
+static const int x264_mb_pred_mode4x4_fix[12] =    /* indexed by intra4x4_pred_e: entries 0..8 map to themselves, the three DC variants (9..11) fold back to I_PRED_4x4_DC */
+{
+    I_PRED_4x4_V,   I_PRED_4x4_H,   I_PRED_4x4_DC,
+    I_PRED_4x4_DDL, I_PRED_4x4_DDR, I_PRED_4x4_VR,
+    I_PRED_4x4_HD,  I_PRED_4x4_VL,  I_PRED_4x4_HU,
+    I_PRED_4x4_DC,  I_PRED_4x4_DC,  I_PRED_4x4_DC
+};
+
+void x264_predict_16x16_init ( int cpu, x264_predict_t pf[7] );    /* fills pf[0..6], indexed by intra16x16_pred_e; cpu is tested against X264_CPU_MMXEXT when built with HAVE_MMXEXT */
+void x264_predict_8x8_init   ( int cpu, x264_predict_t pf[7] );    /* fills pf[0..6], indexed by intra_chroma_pred_e */
+void x264_predict_4x4_init   ( int cpu, x264_predict_t pf[12] );   /* fills pf[0..11], indexed by intra4x4_pred_e */
+
+
+#endif
diff --git a/core/set.h b/core/set.h
new file mode 100644
index 00000000..bfd75e84
--- /dev/null
+++ b/core/set.h
@@ -0,0 +1,123 @@
+/*****************************************************************************
+ * set.h: h264 encoder
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: set.h,v 1.1 2004/06/03 19:27:07 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#ifndef _SET_H
+#define _SET_H 1
+
+enum profile_e     /* numeric profile identifiers; the values look like H.264 profile_idc codes -- TODO confirm against the SPS writer */
+{
+    PROFILE_BASELINE = 66,
+    PROFILE_MAIN = 77,
+    PROFILE_EXTENTED = 88  /* sic: "EXTENDED" is misspelled, but the identifier is a public name, so it is left unchanged */
+};
+
+typedef struct  /* x264_sps_t: sequence-level parameters; field names presumably mirror the H.264 sequence parameter set syntax elements -- TODO confirm against the code that writes them */
+{
+    int i_id;
+
+    int i_profile_idc;  /* presumably one of enum profile_e -- confirm */
+    int i_level_idc;
+
+    int b_constraint_set0;
+    int b_constraint_set1;
+    int b_constraint_set2;
+
+    int i_log2_max_frame_num;
+
+    int i_poc_type;     /* the comments below group the fields by the poc type they belong to */
+    /* poc 0 */
+    int i_log2_max_poc_lsb;
+    /* poc 1 */
+    int b_delta_pic_order_always_zero;
+    int i_offset_for_non_ref_pic;
+    int i_offset_for_top_to_bottom_field;
+    int i_num_ref_frames_in_poc_cycle;
+    int i_offset_for_ref_frame[256];    /* fixed capacity of 256 entries */
+
+    int i_num_ref_frames;
+    int b_gaps_in_frame_num_value_allowed;
+    int i_mb_width;     /* presumably in macroblock units -- confirm */
+    int i_mb_height;    /* presumably in macroblock units -- confirm */
+    int b_frame_mbs_only;
+    int b_mb_adaptive_frame_field;
+    int b_direct8x8_inference;
+
+    int b_crop;         /* presumably tells whether the crop rectangle below is meaningful -- confirm */
+    struct
+    {
+        int i_left;
+        int i_right;
+        int i_top;
+        int i_bottom;
+    } crop;
+
+    int b_vui;          /* presumably tells whether the vui fields below are meaningful -- confirm */
+    struct
+    {
+        int i_sar_width;
+        int i_sar_height;
+        /* FIXME to complete */
+    } vui;
+
+} x264_sps_t;
+
+typedef struct  /* x264_pps_t: picture-level parameters; field names presumably mirror the H.264 picture parameter set syntax elements -- TODO confirm against the code that writes them */
+{
+    int i_id;
+    int i_sps_id;       /* presumably the i_id of the x264_sps_t this set refers to -- confirm */
+
+    int b_cabac;
+
+    int b_pic_order;
+    int i_num_slice_groups;
+
+    int i_slice_group_map_type;     /* the comments below group the fields by the map type they belong to */
+    /* i_slice_group_map_type == 0 */
+    int i_run_length[256];      /* FIXME */
+    /* i_slice_group_map_type == 2 */
+    int i_top_left[256];        /* FIXME */
+    int i_bottom_right[256];    /* FIXME */
+    /* i_slice_group_map_type == 3, 4, 5 */
+    int b_slice_group_change_direction;
+    int i_slice_group_change_rate;
+    /* i_slice_group_map_type == 6 */
+    int i_pic_size_in_map_units;
+    int i_slice_group_id[256];  /* FIXME */
+
+    int i_num_ref_idx_l0_active;
+    int i_num_ref_idx_l1_active;
+
+    int b_weighted_pred;
+    int b_weighted_bipred;
+
+    int i_pic_init_qp;
+    int i_pic_init_qs;
+
+    int i_chroma_qp_index_offset;
+
+    int b_deblocking_filter_control;
+    int b_constrained_intra_pred;
+    int b_redundant_pic_cnt;
+} x264_pps_t;
+
+#endif
diff --git a/core/vlc.h b/core/vlc.h
new file mode 100644
index 00000000..45779435
--- /dev/null
+++ b/core/vlc.h
@@ -0,0 +1,914 @@
+/*****************************************************************************
+ * vlc.h : vlc table
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: vlc.h,v 1.1 2004/06/03 19:27:07 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+typedef struct  /* vlc_t: one variable-length code word */
+{
+    int i_bits;     /* code value, right-aligned (0x5 for the bit string 000101) */
+    int i_size;     /* code length in bits (6 for 000101); 0 in table slots whose bit string is empty */
+} vlc_t;
+
+/* XXX: keep in sync with vlc_t -- expands to a brace initializer in field order, a -> i_bits and b -> i_size */
+#define MKVLC( a, b ) { a, b }
+static const vlc_t x264_coeff_token[5][17*4] =
+{
+    /* table 0 */
+    {
+        MKVLC( 0x1, 1 ), /* str=1 */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+
+        MKVLC( 0x5, 6 ), /* str=000101 */
+        MKVLC( 0x1, 2 ), /* str=01 */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+
+        MKVLC( 0x7, 8 ), /* str=00000111 */
+        MKVLC( 0x4, 6 ), /* str=000100 */
+        MKVLC( 0x1, 3 ), /* str=001 */
+        MKVLC( 0x0, 0 ), /* str= */
+
+        MKVLC( 0x7, 9 ), /* str=000000111 */
+        MKVLC( 0x6, 8 ), /* str=00000110 */
+        MKVLC( 0x5, 7 ), /* str=0000101 */
+        MKVLC( 0x3, 5 ), /* str=00011 */
+
+        MKVLC( 0x7, 10 ), /* str=0000000111 */
+        MKVLC( 0x6, 9 ), /* str=000000110 */
+        MKVLC( 0x5, 8 ), /* str=00000101 */
+        MKVLC( 0x3, 6 ), /* str=000011 */
+
+        MKVLC( 0x7, 11 ), /* str=00000000111 */
+        MKVLC( 0x6, 10 ), /* str=0000000110 */
+        MKVLC( 0x5, 9 ), /* str=000000101 */
+        MKVLC( 0x4, 7 ), /* str=0000100 */
+
+        MKVLC( 0xf, 13 ), /* str=0000000001111 */
+        MKVLC( 0x6, 11 ), /* str=00000000110 */
+        MKVLC( 0x5, 10 ), /* str=0000000101 */
+        MKVLC( 0x4, 8 ), /* str=00000100 */
+
+        MKVLC( 0xb, 13 ), /* str=0000000001011 */
+        MKVLC( 0xe, 13 ), /* str=0000000001110 */
+        MKVLC( 0x5, 11 ), /* str=00000000101 */
+        MKVLC( 0x4, 9 ), /* str=000000100 */
+
+        MKVLC( 0x8, 13 ), /* str=0000000001000 */
+        MKVLC( 0xa, 13 ), /* str=0000000001010 */
+        MKVLC( 0xd, 13 ), /* str=0000000001101 */
+        MKVLC( 0x4, 10 ), /* str=0000000100 */
+
+        MKVLC( 0xf, 14 ), /* str=00000000001111 */
+        MKVLC( 0xe, 14 ), /* str=00000000001110 */
+        MKVLC( 0x9, 13 ), /* str=0000000001001 */
+        MKVLC( 0x4, 11 ), /* str=00000000100 */
+
+        MKVLC( 0xb, 14 ), /* str=00000000001011 */
+        MKVLC( 0xa, 14 ), /* str=00000000001010 */
+        MKVLC( 0xd, 14 ), /* str=00000000001101 */
+        MKVLC( 0xc, 13 ), /* str=0000000001100 */
+
+        MKVLC( 0xf, 15 ), /* str=000000000001111 */
+        MKVLC( 0xe, 15 ), /* str=000000000001110 */
+        MKVLC( 0x9, 14 ), /* str=00000000001001 */
+        MKVLC( 0xc, 14 ), /* str=00000000001100 */
+
+        MKVLC( 0xb, 15 ), /* str=000000000001011 */
+        MKVLC( 0xa, 15 ), /* str=000000000001010 */
+        MKVLC( 0xd, 15 ), /* str=000000000001101 */
+        MKVLC( 0x8, 14 ), /* str=00000000001000 */
+
+        MKVLC( 0xf, 16 ), /* str=0000000000001111 */
+        MKVLC( 0x1, 15 ), /* str=000000000000001 */
+        MKVLC( 0x9, 15 ), /* str=000000000001001 */
+        MKVLC( 0xc, 15 ), /* str=000000000001100 */
+
+        MKVLC( 0xb, 16 ), /* str=0000000000001011 */
+        MKVLC( 0xe, 16 ), /* str=0000000000001110 */
+        MKVLC( 0xd, 16 ), /* str=0000000000001101 */
+        MKVLC( 0x8, 15 ), /* str=000000000001000 */
+
+        MKVLC( 0x7, 16 ), /* str=0000000000000111 */
+        MKVLC( 0xa, 16 ), /* str=0000000000001010 */
+        MKVLC( 0x9, 16 ), /* str=0000000000001001 */
+        MKVLC( 0xc, 16 ), /* str=0000000000001100 */
+
+        MKVLC( 0x4, 16 ), /* str=0000000000000100 */
+        MKVLC( 0x6, 16 ), /* str=0000000000000110 */
+        MKVLC( 0x5, 16 ), /* str=0000000000000101 */
+        MKVLC( 0x8, 16 ), /* str=0000000000001000 */
+    },
+
+    /* table 1 */
+    {
+        MKVLC( 0x3, 2 ), /* str=11 */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+
+        MKVLC( 0xb, 6 ), /* str=001011 */
+        MKVLC( 0x2, 2 ), /* str=10 */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+
+        MKVLC( 0x7, 6 ), /* str=000111 */
+        MKVLC( 0x7, 5 ), /* str=00111 */
+        MKVLC( 0x3, 3 ), /* str=011 */
+        MKVLC( 0x0, 0 ), /* str= */
+
+        MKVLC( 0x7, 7 ), /* str=0000111 */
+        MKVLC( 0xa, 6 ), /* str=001010 */
+        MKVLC( 0x9, 6 ), /* str=001001 */
+        MKVLC( 0x5, 4 ), /* str=0101 */
+
+        MKVLC( 0x7, 8 ), /* str=00000111 */
+        MKVLC( 0x6, 6 ), /* str=000110 */
+        MKVLC( 0x5, 6 ), /* str=000101 */
+        MKVLC( 0x4, 4 ), /* str=0100 */
+
+        MKVLC( 0x4, 8 ), /* str=00000100 */
+        MKVLC( 0x6, 7 ), /* str=0000110 */
+        MKVLC( 0x5, 7 ), /* str=0000101 */
+        MKVLC( 0x6, 5 ), /* str=00110 */
+
+        MKVLC( 0x7, 9 ), /* str=000000111 */
+        MKVLC( 0x6, 8 ), /* str=00000110 */
+        MKVLC( 0x5, 8 ), /* str=00000101 */
+        MKVLC( 0x8, 6 ), /* str=001000 */
+
+        MKVLC( 0xf, 11 ), /* str=00000001111 */
+        MKVLC( 0x6, 9 ), /* str=000000110 */
+        MKVLC( 0x5, 9 ), /* str=000000101 */
+        MKVLC( 0x4, 6 ), /* str=000100 */
+
+        MKVLC( 0xb, 11 ), /* str=00000001011 */
+        MKVLC( 0xe, 11 ), /* str=00000001110 */
+        MKVLC( 0xd, 11 ), /* str=00000001101 */
+        MKVLC( 0x4, 7 ), /* str=0000100 */
+
+        MKVLC( 0xf, 12 ), /* str=000000001111 */
+        MKVLC( 0xa, 11 ), /* str=00000001010 */
+        MKVLC( 0x9, 11 ), /* str=00000001001 */
+        MKVLC( 0x4, 9 ), /* str=000000100 */
+
+        MKVLC( 0xb, 12 ), /* str=000000001011 */
+        MKVLC( 0xe, 12 ), /* str=000000001110 */
+        MKVLC( 0xd, 12 ), /* str=000000001101 */
+        MKVLC( 0xc, 11 ), /* str=00000001100 */
+
+        MKVLC( 0x8, 12 ), /* str=000000001000 */
+        MKVLC( 0xa, 12 ), /* str=000000001010 */
+        MKVLC( 0x9, 12 ), /* str=000000001001 */
+        MKVLC( 0x8, 11 ), /* str=00000001000 */
+
+        MKVLC( 0xf, 13 ), /* str=0000000001111 */
+        MKVLC( 0xe, 13 ), /* str=0000000001110 */
+        MKVLC( 0xd, 13 ), /* str=0000000001101 */
+        MKVLC( 0xc, 12 ), /* str=000000001100 */
+
+        MKVLC( 0xb, 13 ), /* str=0000000001011 */
+        MKVLC( 0xa, 13 ), /* str=0000000001010 */
+        MKVLC( 0x9, 13 ), /* str=0000000001001 */
+        MKVLC( 0xc, 13 ), /* str=0000000001100 */
+
+        MKVLC( 0x7, 13 ), /* str=0000000000111 */
+        MKVLC( 0xb, 14 ), /* str=00000000001011 */
+        MKVLC( 0x6, 13 ), /* str=0000000000110 */
+        MKVLC( 0x8, 13 ), /* str=0000000001000 */
+
+        MKVLC( 0x9, 14 ), /* str=00000000001001 */
+        MKVLC( 0x8, 14 ), /* str=00000000001000 */
+        MKVLC( 0xa, 14 ), /* str=00000000001010 */
+        MKVLC( 0x1, 13 ), /* str=0000000000001 */
+
+        MKVLC( 0x7, 14 ), /* str=00000000000111 */
+        MKVLC( 0x6, 14 ), /* str=00000000000110 */
+        MKVLC( 0x5, 14 ), /* str=00000000000101 */
+        MKVLC( 0x4, 14 ), /* str=00000000000100 */
+    },
+    /* table 2 */
+    {
+        MKVLC( 0xf, 4 ), /* str=1111 */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+
+        MKVLC( 0xf, 6 ), /* str=001111 */
+        MKVLC( 0xe, 4 ), /* str=1110 */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+
+        MKVLC( 0xb, 6 ), /* str=001011 */
+        MKVLC( 0xf, 5 ), /* str=01111 */
+        MKVLC( 0xd, 4 ), /* str=1101 */
+        MKVLC( 0x0, 0 ), /* str= */
+
+        MKVLC( 0x8, 6 ), /* str=001000 */
+        MKVLC( 0xc, 5 ), /* str=01100 */
+        MKVLC( 0xe, 5 ), /* str=01110 */
+        MKVLC( 0xc, 4 ), /* str=1100 */
+
+        MKVLC( 0xf, 7 ), /* str=0001111 */
+        MKVLC( 0xa, 5 ), /* str=01010 */
+        MKVLC( 0xb, 5 ), /* str=01011 */
+        MKVLC( 0xb, 4 ), /* str=1011 */
+
+        MKVLC( 0xb, 7 ), /* str=0001011 */
+        MKVLC( 0x8, 5 ), /* str=01000 */
+        MKVLC( 0x9, 5 ), /* str=01001 */
+        MKVLC( 0xa, 4 ), /* str=1010 */
+
+        MKVLC( 0x9, 7 ), /* str=0001001 */
+        MKVLC( 0xe, 6 ), /* str=001110 */
+        MKVLC( 0xd, 6 ), /* str=001101 */
+        MKVLC( 0x9, 4 ), /* str=1001 */
+
+        MKVLC( 0x8, 7 ), /* str=0001000 */
+        MKVLC( 0xa, 6 ), /* str=001010 */
+        MKVLC( 0x9, 6 ), /* str=001001 */
+        MKVLC( 0x8, 4 ), /* str=1000 */
+
+        MKVLC( 0xf, 8 ), /* str=00001111 */
+        MKVLC( 0xe, 7 ), /* str=0001110 */
+        MKVLC( 0xd, 7 ), /* str=0001101 */
+        MKVLC( 0xd, 5 ), /* str=01101 */
+
+        MKVLC( 0xb, 8 ), /* str=00001011 */
+        MKVLC( 0xe, 8 ), /* str=00001110 */
+        MKVLC( 0xa, 7 ), /* str=0001010 */
+        MKVLC( 0xc, 6 ), /* str=001100 */
+
+        MKVLC( 0xf, 9 ), /* str=000001111 */
+        MKVLC( 0xa, 8 ), /* str=00001010 */
+        MKVLC( 0xd, 8 ), /* str=00001101 */
+        MKVLC( 0xc, 7 ), /* str=0001100 */
+
+        MKVLC( 0xb, 9 ), /* str=000001011 */
+        MKVLC( 0xe, 9 ), /* str=000001110 */
+        MKVLC( 0x9, 8 ), /* str=00001001 */
+        MKVLC( 0xc, 8 ), /* str=00001100 */
+
+        MKVLC( 0x8, 9 ), /* str=000001000 */
+        MKVLC( 0xa, 9 ), /* str=000001010 */
+        MKVLC( 0xd, 9 ), /* str=000001101 */
+        MKVLC( 0x8, 8 ), /* str=00001000 */
+
+        MKVLC( 0xd, 10 ), /* str=0000001101 */
+        MKVLC( 0x7, 9 ), /* str=000000111 */
+        MKVLC( 0x9, 9 ), /* str=000001001 */
+        MKVLC( 0xc, 9 ), /* str=000001100 */
+
+        MKVLC( 0x9, 10 ), /* str=0000001001 */
+        MKVLC( 0xc, 10 ), /* str=0000001100 */
+        MKVLC( 0xb, 10 ), /* str=0000001011 */
+        MKVLC( 0xa, 10 ), /* str=0000001010 */
+
+        MKVLC( 0x5, 10 ), /* str=0000000101 */
+        MKVLC( 0x8, 10 ), /* str=0000001000 */
+        MKVLC( 0x7, 10 ), /* str=0000000111 */
+        MKVLC( 0x6, 10 ), /* str=0000000110 */
+
+        MKVLC( 0x1, 10 ), /* str=0000000001 */
+        MKVLC( 0x4, 10 ), /* str=0000000100 */
+        MKVLC( 0x3, 10 ), /* str=0000000011 */
+        MKVLC( 0x2, 10 ), /* str=0000000010 */
+    },
+
+    /* table 3 */
+    {
+        MKVLC( 0x3, 6 ), /* str=000011 */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+
+        MKVLC( 0x0, 6 ), /* str=000000 */
+        MKVLC( 0x1, 6 ), /* str=000001 */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+
+        MKVLC( 0x4, 6 ), /* str=000100 */
+        MKVLC( 0x5, 6 ), /* str=000101 */
+        MKVLC( 0x6, 6 ), /* str=000110 */
+        MKVLC( 0x0, 0 ), /* str= */
+
+        MKVLC( 0x8, 6 ), /* str=001000 */
+        MKVLC( 0x9, 6 ), /* str=001001 */
+        MKVLC( 0xa, 6 ), /* str=001010 */
+        MKVLC( 0xb, 6 ), /* str=001011 */
+
+        MKVLC( 0xc, 6 ), /* str=001100 */
+        MKVLC( 0xd, 6 ), /* str=001101 */
+        MKVLC( 0xe, 6 ), /* str=001110 */
+        MKVLC( 0xf, 6 ), /* str=001111 */
+
+        MKVLC( 0x10, 6 ), /* str=010000 */
+        MKVLC( 0x11, 6 ), /* str=010001 */
+        MKVLC( 0x12, 6 ), /* str=010010 */
+        MKVLC( 0x13, 6 ), /* str=010011 */
+
+        MKVLC( 0x14, 6 ), /* str=010100 */
+        MKVLC( 0x15, 6 ), /* str=010101 */
+        MKVLC( 0x16, 6 ), /* str=010110 */
+        MKVLC( 0x17, 6 ), /* str=010111 */
+
+        MKVLC( 0x18, 6 ), /* str=011000 */
+        MKVLC( 0x19, 6 ), /* str=011001 */
+        MKVLC( 0x1a, 6 ), /* str=011010 */
+        MKVLC( 0x1b, 6 ), /* str=011011 */
+
+        MKVLC( 0x1c, 6 ), /* str=011100 */
+        MKVLC( 0x1d, 6 ), /* str=011101 */
+        MKVLC( 0x1e, 6 ), /* str=011110 */
+        MKVLC( 0x1f, 6 ), /* str=011111 */
+
+        MKVLC( 0x20, 6 ), /* str=100000 */
+        MKVLC( 0x21, 6 ), /* str=100001 */
+        MKVLC( 0x22, 6 ), /* str=100010 */
+        MKVLC( 0x23, 6 ), /* str=100011 */
+
+        MKVLC( 0x24, 6 ), /* str=100100 */
+        MKVLC( 0x25, 6 ), /* str=100101 */
+        MKVLC( 0x26, 6 ), /* str=100110 */
+        MKVLC( 0x27, 6 ), /* str=100111 */
+
+        MKVLC( 0x28, 6 ), /* str=101000 */
+        MKVLC( 0x29, 6 ), /* str=101001 */
+        MKVLC( 0x2a, 6 ), /* str=101010 */
+        MKVLC( 0x2b, 6 ), /* str=101011 */
+
+        MKVLC( 0x2c, 6 ), /* str=101100 */
+        MKVLC( 0x2d, 6 ), /* str=101101 */
+        MKVLC( 0x2e, 6 ), /* str=101110 */
+        MKVLC( 0x2f, 6 ), /* str=101111 */
+
+        MKVLC( 0x30, 6 ), /* str=110000 */
+        MKVLC( 0x31, 6 ), /* str=110001 */
+        MKVLC( 0x32, 6 ), /* str=110010 */
+        MKVLC( 0x33, 6 ), /* str=110011 */
+
+        MKVLC( 0x34, 6 ), /* str=110100 */
+        MKVLC( 0x35, 6 ), /* str=110101 */
+        MKVLC( 0x36, 6 ), /* str=110110 */
+        MKVLC( 0x37, 6 ), /* str=110111 */
+
+        MKVLC( 0x38, 6 ), /* str=111000 */
+        MKVLC( 0x39, 6 ), /* str=111001 */
+        MKVLC( 0x3a, 6 ), /* str=111010 */
+        MKVLC( 0x3b, 6 ), /* str=111011 */
+
+        MKVLC( 0x3c, 6 ), /* str=111100 */
+        MKVLC( 0x3d, 6 ), /* str=111101 */
+        MKVLC( 0x3e, 6 ), /* str=111110 */
+        MKVLC( 0x3f, 6 ), /* str=111111 */
+    },
+
+    /* table 4 */
+    {
+        MKVLC( 0x1, 2 ), /* str=01 */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+
+        MKVLC( 0x7, 6 ), /* str=000111 */
+        MKVLC( 0x1, 1 ), /* str=1 */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+
+        MKVLC( 0x4, 6 ), /* str=000100 */
+        MKVLC( 0x6, 6 ), /* str=000110 */
+        MKVLC( 0x1, 3 ), /* str=001 */
+        MKVLC( 0x0, 0 ), /* str= */
+
+        MKVLC( 0x3, 6 ), /* str=000011 */
+        MKVLC( 0x3, 7 ), /* str=0000011 */
+        MKVLC( 0x2, 7 ), /* str=0000010 */
+        MKVLC( 0x5, 6 ), /* str=000101 */
+
+        MKVLC( 0x2, 6 ), /* str=000010 */
+        MKVLC( 0x3, 8 ), /* str=00000011 */
+        MKVLC( 0x2, 8 ), /* str=00000010 */
+        MKVLC( 0x0, 7 ), /* str=0000000 */
+
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+    }
+};
+
+static const vlc_t x264_level_prefix[16] = /* entry [n] is the (n+1)-bit code made of n zero bits followed by a single 1 */
+{
+    MKVLC( 0x01,  1 ), /* str=1 */
+    MKVLC( 0x01,  2 ), /* str=01 */
+    MKVLC( 0x01,  3 ), /* str=001 */
+    MKVLC( 0x01,  4 ), /* str=0001 */
+    MKVLC( 0x01,  5 ), /* str=00001 */
+    MKVLC( 0x01,  6 ), /* str=000001 */
+    MKVLC( 0x01,  7 ), /* str=0000001 */
+    MKVLC( 0x01,  8 ), /* str=00000001 */
+    MKVLC( 0x01,  9 ), /* str=000000001 */
+    MKVLC( 0x01, 10 ), /* str=0000000001 */
+    MKVLC( 0x01, 11 ), /* str=00000000001 */
+    MKVLC( 0x01, 12 ), /* str=000000000001 */
+    MKVLC( 0x01, 13 ), /* str=0000000000001 */
+    MKVLC( 0x01, 14 ), /* str=00000000000001 */
+    MKVLC( 0x01, 15 ), /* str=000000000000001 */
+    MKVLC( 0x01, 16 )  /* str=0000000000000001 */
+};
+
+/* total_zeros codewords, indexed [i_total_coeff-1][i_total_zeros]; row "i_total n" holds 17-n codes, the MKVLC( 0x0, 0 ) entries after them are padding (no codeword) */
+static const vlc_t x264_total_zeros[15][16] =
+{
+    { /* i_total 1 */
+        MKVLC( 0x1, 1 ), /* str=1 */
+        MKVLC( 0x3, 3 ), /* str=011 */
+        MKVLC( 0x2, 3 ), /* str=010 */
+        MKVLC( 0x3, 4 ), /* str=0011 */
+        MKVLC( 0x2, 4 ), /* str=0010 */
+        MKVLC( 0x3, 5 ), /* str=00011 */
+        MKVLC( 0x2, 5 ), /* str=00010 */
+        MKVLC( 0x3, 6 ), /* str=000011 */
+        MKVLC( 0x2, 6 ), /* str=000010 */
+        MKVLC( 0x3, 7 ), /* str=0000011 */
+        MKVLC( 0x2, 7 ), /* str=0000010 */
+        MKVLC( 0x3, 8 ), /* str=00000011 */
+        MKVLC( 0x2, 8 ), /* str=00000010 */
+        MKVLC( 0x3, 9 ), /* str=000000011 */
+        MKVLC( 0x2, 9 ), /* str=000000010 */
+        MKVLC( 0x1, 9 ), /* str=000000001 */
+    },
+    { /* i_total 2 */
+        MKVLC( 0x7, 3 ), /* str=111 */
+        MKVLC( 0x6, 3 ), /* str=110 */
+        MKVLC( 0x5, 3 ), /* str=101 */
+        MKVLC( 0x4, 3 ), /* str=100 */
+        MKVLC( 0x3, 3 ), /* str=011 */
+        MKVLC( 0x5, 4 ), /* str=0101 */
+        MKVLC( 0x4, 4 ), /* str=0100 */
+        MKVLC( 0x3, 4 ), /* str=0011 */
+        MKVLC( 0x2, 4 ), /* str=0010 */
+        MKVLC( 0x3, 5 ), /* str=00011 */
+        MKVLC( 0x2, 5 ), /* str=00010 */
+        MKVLC( 0x3, 6 ), /* str=000011 */
+        MKVLC( 0x2, 6 ), /* str=000010 */
+        MKVLC( 0x1, 6 ), /* str=000001 */
+        MKVLC( 0x0, 6 ), /* str=000000 */
+        MKVLC( 0x0, 0 ), /* str= */
+    },
+    { /* i_total 3 */
+        MKVLC( 0x5, 4 ), /* str=0101 */
+        MKVLC( 0x7, 3 ), /* str=111 */
+        MKVLC( 0x6, 3 ), /* str=110 */
+        MKVLC( 0x5, 3 ), /* str=101 */
+        MKVLC( 0x4, 4 ), /* str=0100 */
+        MKVLC( 0x3, 4 ), /* str=0011 */
+        MKVLC( 0x4, 3 ), /* str=100 */
+        MKVLC( 0x3, 3 ), /* str=011 */
+        MKVLC( 0x2, 4 ), /* str=0010 */
+        MKVLC( 0x3, 5 ), /* str=00011 */
+        MKVLC( 0x2, 5 ), /* str=00010 */
+        MKVLC( 0x1, 6 ), /* str=000001 */
+        MKVLC( 0x1, 5 ), /* str=00001 */
+        MKVLC( 0x0, 6 ), /* str=000000 */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+    },
+    { /* i_total 4 */
+        MKVLC( 0x3, 5 ), /* str=00011 */
+        MKVLC( 0x7, 3 ), /* str=111 */
+        MKVLC( 0x5, 4 ), /* str=0101 */
+        MKVLC( 0x4, 4 ), /* str=0100 */
+        MKVLC( 0x6, 3 ), /* str=110 */
+        MKVLC( 0x5, 3 ), /* str=101 */
+        MKVLC( 0x4, 3 ), /* str=100 */
+        MKVLC( 0x3, 4 ), /* str=0011 */
+        MKVLC( 0x3, 3 ), /* str=011 */
+        MKVLC( 0x2, 4 ), /* str=0010 */
+        MKVLC( 0x2, 5 ), /* str=00010 */
+        MKVLC( 0x1, 5 ), /* str=00001 */
+        MKVLC( 0x0, 5 ), /* str=00000 */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+    },
+    { /* i_total 5 */
+        MKVLC( 0x5, 4 ), /* str=0101 */
+        MKVLC( 0x4, 4 ), /* str=0100 */
+        MKVLC( 0x3, 4 ), /* str=0011 */
+        MKVLC( 0x7, 3 ), /* str=111 */
+        MKVLC( 0x6, 3 ), /* str=110 */
+        MKVLC( 0x5, 3 ), /* str=101 */
+        MKVLC( 0x4, 3 ), /* str=100 */
+        MKVLC( 0x3, 3 ), /* str=011 */
+        MKVLC( 0x2, 4 ), /* str=0010 */
+        MKVLC( 0x1, 5 ), /* str=00001 */
+        MKVLC( 0x1, 4 ), /* str=0001 */
+        MKVLC( 0x0, 5 ), /* str=00000 */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+    },
+    { /* i_total 6 */
+        MKVLC( 0x1, 6 ), /* str=000001 */
+        MKVLC( 0x1, 5 ), /* str=00001 */
+        MKVLC( 0x7, 3 ), /* str=111 */
+        MKVLC( 0x6, 3 ), /* str=110 */
+        MKVLC( 0x5, 3 ), /* str=101 */
+        MKVLC( 0x4, 3 ), /* str=100 */
+        MKVLC( 0x3, 3 ), /* str=011 */
+        MKVLC( 0x2, 3 ), /* str=010 */
+        MKVLC( 0x1, 4 ), /* str=0001 */
+        MKVLC( 0x1, 3 ), /* str=001 */
+        MKVLC( 0x0, 6 ), /* str=000000 */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+    },
+    { /* i_total 7 */
+        MKVLC( 0x1, 6 ), /* str=000001 */
+        MKVLC( 0x1, 5 ), /* str=00001 */
+        MKVLC( 0x5, 3 ), /* str=101 */
+        MKVLC( 0x4, 3 ), /* str=100 */
+        MKVLC( 0x3, 3 ), /* str=011 */
+        MKVLC( 0x3, 2 ), /* str=11 */
+        MKVLC( 0x2, 3 ), /* str=010 */
+        MKVLC( 0x1, 4 ), /* str=0001 */
+        MKVLC( 0x1, 3 ), /* str=001 */
+        MKVLC( 0x0, 6 ), /* str=000000 */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+    },
+    { /* i_total 8 */
+        MKVLC( 0x1, 6 ), /* str=000001 */
+        MKVLC( 0x1, 4 ), /* str=0001 */
+        MKVLC( 0x1, 5 ), /* str=00001 */
+        MKVLC( 0x3, 3 ), /* str=011 */
+        MKVLC( 0x3, 2 ), /* str=11 */
+        MKVLC( 0x2, 2 ), /* str=10 */
+        MKVLC( 0x2, 3 ), /* str=010 */
+        MKVLC( 0x1, 3 ), /* str=001 */
+        MKVLC( 0x0, 6 ), /* str=000000 */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+    },
+    { /* i_total 9 */
+        MKVLC( 0x1, 6 ), /* str=000001 */
+        MKVLC( 0x0, 6 ), /* str=000000 */
+        MKVLC( 0x1, 4 ), /* str=0001 */
+        MKVLC( 0x3, 2 ), /* str=11 */
+        MKVLC( 0x2, 2 ), /* str=10 */
+        MKVLC( 0x1, 3 ), /* str=001 */
+        MKVLC( 0x1, 2 ), /* str=01 */
+        MKVLC( 0x1, 5 ), /* str=00001 */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+    },
+    { /* i_total 10 */
+        MKVLC( 0x1, 5 ), /* str=00001 */
+        MKVLC( 0x0, 5 ), /* str=00000 */
+        MKVLC( 0x1, 3 ), /* str=001 */
+        MKVLC( 0x3, 2 ), /* str=11 */
+        MKVLC( 0x2, 2 ), /* str=10 */
+        MKVLC( 0x1, 2 ), /* str=01 */
+        MKVLC( 0x1, 4 ), /* str=0001 */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+    },
+    { /* i_total 11 */
+        MKVLC( 0x0, 4 ), /* str=0000 */
+        MKVLC( 0x1, 4 ), /* str=0001 */
+        MKVLC( 0x1, 3 ), /* str=001 */
+        MKVLC( 0x2, 3 ), /* str=010 */
+        MKVLC( 0x1, 1 ), /* str=1 */
+        MKVLC( 0x3, 3 ), /* str=011 */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+    },
+    { /* i_total 12 */
+        MKVLC( 0x0, 4 ), /* str=0000 */
+        MKVLC( 0x1, 4 ), /* str=0001 */
+        MKVLC( 0x1, 2 ), /* str=01 */
+        MKVLC( 0x1, 1 ), /* str=1 */
+        MKVLC( 0x1, 3 ), /* str=001 */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+    },
+    { /* i_total 13 */
+        MKVLC( 0x0, 3 ), /* str=000 */
+        MKVLC( 0x1, 3 ), /* str=001 */
+        MKVLC( 0x1, 1 ), /* str=1 */
+        MKVLC( 0x1, 2 ), /* str=01 */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+    },
+    { /* i_total 14 */
+        MKVLC( 0x0, 2 ), /* str=00 */
+        MKVLC( 0x1, 2 ), /* str=01 */
+        MKVLC( 0x1, 1 ), /* str=1 */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+    },
+    { /* i_total 15 */
+        MKVLC( 0x0, 1 ), /* str=0 */
+        MKVLC( 0x1, 1 ), /* str=1 */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+    },
+};
+
+/* total_zeros codewords for the "dc" case (presumably chroma DC blocks - confirm against the caller), indexed [i_total_coeff-1][i_total_zeros]; MKVLC( 0x00, 0 ) = no codeword */
+static const vlc_t x264_total_zeros_dc[3][4] =
+{
+    {
+        MKVLC( 0x01, 1 ), /* 1  */
+        MKVLC( 0x01, 2 ), /* 01 */
+        MKVLC( 0x01, 3 ), /* 001*/
+        MKVLC( 0x00, 3 )  /* 000*/
+    },
+    {
+        MKVLC( 0x01, 1 ), /* 1  */
+        MKVLC( 0x01, 2 ), /* 01 */
+        MKVLC( 0x00, 2 ), /* 00 */
+        MKVLC( 0x00, 0 )  /*    */
+    },
+    {
+        MKVLC( 0x01, 1 ), /* 1  */
+        MKVLC( 0x00, 1 ), /* 0  */
+        MKVLC( 0x00, 0 ), /*    */
+        MKVLC( 0x00, 0 )  /*    */
+    }
+};
+
+/* run_before codewords: x264_run_before[__MIN( i_zero_left -1, 6 )][run_before]; the last row serves every i_zero_left >= 7, MKVLC( 0x0, 0 ) entries are padding */
+static const vlc_t x264_run_before[7][15] =
+{
+    { /* i_zero_left 1 */
+        MKVLC( 0x1, 1 ), /* str=1 */
+        MKVLC( 0x0, 1 ), /* str=0 */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+    },
+    { /* i_zero_left 2 */
+        MKVLC( 0x1, 1 ), /* str=1 */
+        MKVLC( 0x1, 2 ), /* str=01 */
+        MKVLC( 0x0, 2 ), /* str=00 */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+    },
+    { /* i_zero_left 3 */
+        MKVLC( 0x3, 2 ), /* str=11 */
+        MKVLC( 0x2, 2 ), /* str=10 */
+        MKVLC( 0x1, 2 ), /* str=01 */
+        MKVLC( 0x0, 2 ), /* str=00 */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+    },
+    { /* i_zero_left 4 */
+        MKVLC( 0x3, 2 ), /* str=11 */
+        MKVLC( 0x2, 2 ), /* str=10 */
+        MKVLC( 0x1, 2 ), /* str=01 */
+        MKVLC( 0x1, 3 ), /* str=001 */
+        MKVLC( 0x0, 3 ), /* str=000 */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+    },
+    { /* i_zero_left 5 */
+        MKVLC( 0x3, 2 ), /* str=11 */
+        MKVLC( 0x2, 2 ), /* str=10 */
+        MKVLC( 0x3, 3 ), /* str=011 */
+        MKVLC( 0x2, 3 ), /* str=010 */
+        MKVLC( 0x1, 3 ), /* str=001 */
+        MKVLC( 0x0, 3 ), /* str=000 */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+    },
+    { /* i_zero_left 6 */
+        MKVLC( 0x3, 2 ), /* str=11 */
+        MKVLC( 0x0, 3 ), /* str=000 */
+        MKVLC( 0x1, 3 ), /* str=001 */
+        MKVLC( 0x3, 3 ), /* str=011 */
+        MKVLC( 0x2, 3 ), /* str=010 */
+        MKVLC( 0x5, 3 ), /* str=101 */
+        MKVLC( 0x4, 3 ), /* str=100 */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+    },
+    { /* i_zero_left 7 or more (the row index is clamped to 6) */
+        MKVLC( 0x7, 3 ), /* str=111 */
+        MKVLC( 0x6, 3 ), /* str=110 */
+        MKVLC( 0x5, 3 ), /* str=101 */
+        MKVLC( 0x4, 3 ), /* str=100 */
+        MKVLC( 0x3, 3 ), /* str=011 */
+        MKVLC( 0x2, 3 ), /* str=010 */
+        MKVLC( 0x1, 3 ), /* str=001 */
+        MKVLC( 0x1, 4 ), /* str=0001 */
+        MKVLC( 0x1, 5 ), /* str=00001 */
+        MKVLC( 0x1, 6 ), /* str=000001 */
+        MKVLC( 0x1, 7 ), /* str=0000001 */
+        MKVLC( 0x1, 8 ), /* str=00000001 */
+        MKVLC( 0x1, 9 ), /* str=000000001 */
+        MKVLC( 0x1, 10 ), /* str=0000000001 */
+        MKVLC( 0x1, 11 ), /* str=00000000001 */
+    },
+};
diff --git a/decoder/decoder.c b/decoder/decoder.c
new file mode 100644
index 00000000..17c70327
--- /dev/null
+++ b/decoder/decoder.c
@@ -0,0 +1,772 @@
+/*****************************************************************************
+ * x264: h264 decoder
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: decoder.c,v 1.1 2004/06/03 19:27:07 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdint.h>
+
+#include "../core/common.h"
+#include "../core/cpu.h"
+#include "../core/vlc.h"
+
+#include "macroblock.h"
+#include "set.h"
+#include "vlc.h"
+
+
+static void x264_slice_idr( x264_t *h )
+{
+    /* Restart POC / frame numbering and forget every stored reference. */
+    int i_slot;
+
+    h->i_frame_num = h->i_frame_offset = 0;
+    h->i_poc_lsb = h->i_poc_msb = 0;
+
+    /* the size of the frame pool comes from the SPS: nothing to do without one */
+    if( !h->sps )
+    {
+        return;
+    }
+    /* a negative poc keeps a frame out of the ref lists built for later slices */
+    for( i_slot = 0; i_slot < h->sps->i_num_ref_frames + 1; i_slot++ )
+    {
+        h->freference[i_slot]->i_poc = -1;
+    }
+    h->i_ref0 = h->i_ref1 = 0;
+    h->fdec = h->freference[0];
+}
+
+/* Parse the first part of a slice header, i.e. everything located before
+ * ref_pic_list_reordering( ); what follows dec_ref_pic_marking( ) is left to
+ * x264_slice_header_part2_read.  Returns 0 on success, -1 when the pps_id is
+ * unknown or when bs_eof( s ) is set once the fields have been read. */
+static int x264_slice_header_part1_read( bs_t *s,
+                                         x264_slice_header_t *sh, x264_sps_t sps_array[32], x264_pps_t pps_array[256], int b_idr )
+{
+    sh->i_first_mb = bs_read_ue( s );
+    sh->i_type = bs_read_ue( s );
+    if( sh->i_type >= 5 )
+    {
+        sh->i_type -= 5;    /* 5..9 name the same slice types as 0..4 */
+    }
+    /* the id indexes pps_array[]; a slot whose i_id is -1 holds no pps */
+    sh->i_pps_id = bs_read_ue( s );
+    if( bs_eof( s ) || sh->i_pps_id >= 256 || pps_array[sh->i_pps_id].i_id == -1 )
+    {
+        fprintf( stderr, "invalid pps_id in slice header\n" );
+        return -1;
+    }
+
+    sh->pps = &pps_array[sh->i_pps_id];
+    sh->sps = &sps_array[sh->pps->i_sps_id];    /* valid if pps valid */
+
+    sh->i_frame_num = bs_read( s, sh->sps->i_log2_max_frame_num );
+
+    /* x264_slice_header_decode passes &h->sh, which lives across slices: give each
+     * optional element its "not present" value (0) first so nothing stale survives. */
+    sh->b_field_pic = 0;
+    sh->b_bottom_field = 0;
+    if( !sh->sps->b_frame_mbs_only )
+    {
+        sh->b_field_pic = bs_read1( s );
+        if( sh->b_field_pic )
+        {
+            sh->b_bottom_field = bs_read1( s );
+        }
+    }
+
+    sh->i_idr_pic_id = b_idr ? bs_read_ue( s ) : 0;
+
+    sh->i_delta_poc_bottom = 0;
+    sh->i_delta_poc[0] = 0;
+    sh->i_delta_poc[1] = 0;
+    if( sh->sps->i_poc_type == 0 )
+    {
+        sh->i_poc_lsb = bs_read( s, sh->sps->i_log2_max_poc_lsb );
+        if( sh->pps->b_pic_order && !sh->b_field_pic )
+        {
+            sh->i_delta_poc_bottom = bs_read_se( s );
+        }
+    }
+    else if( sh->sps->i_poc_type == 1 && !sh->sps->b_delta_pic_order_always_zero )
+    {
+        sh->i_delta_poc[0] = bs_read_se( s );
+        if( sh->pps->b_pic_order && !sh->b_field_pic )
+        {
+            sh->i_delta_poc[1] = bs_read_se( s );
+        }
+    }
+
+    sh->i_redundant_pic_cnt = sh->pps->b_redundant_pic_cnt ? bs_read_ue( s ) : 0;
+
+    if( sh->i_type == SLICE_TYPE_B )
+    {
+        sh->b_direct_spatial_mv_pred = bs_read1( s );
+    }
+
+    /* only slices that use inter prediction carry reference counts */
+    if( sh->i_type == SLICE_TYPE_P || sh->i_type == SLICE_TYPE_SP || sh->i_type == SLICE_TYPE_B )
+    {
+        sh->b_num_ref_idx_override = bs_read1( s );
+
+        sh->i_num_ref_idx_l0_active = sh->pps->i_num_ref_idx_l0_active; /* default */
+        sh->i_num_ref_idx_l1_active = sh->pps->i_num_ref_idx_l1_active; /* default */
+
+        if( sh->b_num_ref_idx_override )
+        {
+            sh->i_num_ref_idx_l0_active = bs_read_ue( s ) + 1;
+            if( sh->i_type == SLICE_TYPE_B )
+            {
+                sh->i_num_ref_idx_l1_active = bs_read_ue( s ) + 1;
+            }
+        }
+    }
+
+    return bs_eof( s ) ? -1 : 0;
+}
+
+static int x264_slice_header_part2_read( bs_t *s, x264_slice_header_t *sh )
+{
+    /* Second half of the slice header (sh->pps and sh->i_type come from part1); returns 0, or -1 if unsupported. */
+    if( sh->pps->b_cabac && sh->i_type != SLICE_TYPE_I && sh->i_type != SLICE_TYPE_SI )
+    {
+        sh->i_cabac_init_idc = bs_read_ue( s );
+    }
+    sh->i_qp_delta = bs_read_se( s );
+
+    if( sh->i_type == SLICE_TYPE_SI || sh->i_type == SLICE_TYPE_SP )
+    {
+        if( sh->i_type == SLICE_TYPE_SP )
+        {
+            sh->b_sp_for_swidth = bs_read1( s );
+        }
+        sh->i_qs_delta = bs_read_se( s );
+    }
+
+    /* deblocking elements absent from the stream are inferred to be 0 (sh is reused, so reset all three) */
+    sh->i_disable_deblocking_filter_idc = 0;
+    sh->i_alpha_c0_offset = 0;
+    sh->i_beta_offset = 0;
+    if( sh->pps->b_deblocking_filter_control )
+    {
+        sh->i_disable_deblocking_filter_idc = bs_read_ue( s );
+        if( sh->i_disable_deblocking_filter_idc != 1 )
+        {
+            sh->i_alpha_c0_offset = bs_read_se( s );
+            sh->i_beta_offset = bs_read_se( s );
+        }
+    }
+
+    if( sh->pps->i_num_slice_groups > 1 && sh->pps->i_slice_group_map_type >= 3 && sh->pps->i_slice_group_map_type <= 5 )
+    {
+        /* FIXME: the extra syntax used by slice group map types 3..5 is not parsed */
+        return -1;
+    }
+    return 0;
+}
+
+static int x264_slice_header_ref_pic_reordering( x264_t *h, bs_t *s )
+{
+    /* Build the default ref lists for the slice in h->sh, then read the reordering flags; -1 if reordering is asked for. */
+    int i_max0 = h->pps->i_num_ref_idx_l0_active;
+    int i_max1 = h->pps->i_num_ref_idx_l1_active;
+    int i, b_ok;
+
+    /* decode into slot 0, which is kept out of the reference lists below */
+    h->fdec = h->freference[0];
+    h->fdec->i_poc = h->i_poc;
+
+    /* build ref list 0/1: lower poc than the current frame -> list 0, higher -> list 1 */
+    h->i_ref0 = 0;
+    h->i_ref1 = 0;
+    for( i = 1; i < h->sps->i_num_ref_frames + 1; i++ )
+    {
+        if( h->freference[i]->i_poc >= 0 )
+        {
+            if( h->freference[i]->i_poc < h->fdec->i_poc )
+            {
+                h->fref0[h->i_ref0++] = h->freference[i];
+            }
+            else if( h->freference[i]->i_poc > h->fdec->i_poc )
+            {
+                h->fref1[h->i_ref1++] = h->freference[i];
+            }
+        }
+    }
+
+    /* Order ref0 from higher to lower poc (swap one pair, then rescan) */
+    do
+    {
+        b_ok = 1;
+        for( i = 0; i < h->i_ref0 - 1; i++ )
+        {
+            if( h->fref0[i]->i_poc < h->fref0[i+1]->i_poc )
+            {
+                x264_frame_t *tmp = h->fref0[i+1];
+                h->fref0[i+1] = h->fref0[i];
+                h->fref0[i] = tmp;
+                b_ok = 0;
+                break;
+            }
+        }
+    } while( !b_ok );
+    /* Order ref1 from lower to higher poc (bubble sort) for B-frame */
+    do
+    {
+        b_ok = 1;
+        for( i = 0; i < h->i_ref1 - 1; i++ )
+        {
+            if( h->fref1[i]->i_poc > h->fref1[i+1]->i_poc )
+            {
+                x264_frame_t *tmp = h->fref1[i+1];
+                h->fref1[i+1] = h->fref1[i];
+                h->fref1[i] = tmp;
+                b_ok = 0;
+                break;
+            }
+        }
+    } while( !b_ok );
+
+    /* P/SP/B headers hold the effective counts (pps default or slice override): they bound the lists */
+    if( h->sh.i_type == SLICE_TYPE_P || h->sh.i_type == SLICE_TYPE_SP || h->sh.i_type == SLICE_TYPE_B )
+    {
+        i_max0 = h->sh.i_num_ref_idx_l0_active;
+        i_max1 = h->sh.i_num_ref_idx_l1_active;
+    }
+    if( h->i_ref0 > i_max0 )
+    {
+        h->i_ref0 = i_max0;
+    }
+    if( h->i_ref1 > i_max1 )
+    {
+        h->i_ref1 = i_max1;
+    }
+
+    /* Now parse the stream and change the default order */
+    if( h->sh.i_type != SLICE_TYPE_I && h->sh.i_type != SLICE_TYPE_SI )
+    {
+        /* FIXME: explicit reordering of list 0 is not implemented */
+        if( bs_read1( s ) )
+        {
+            return -1;
+        }
+    }
+    if( h->sh.i_type == SLICE_TYPE_B )
+    {
+        /* FIXME: explicit reordering of list 1 is not implemented */
+        if( bs_read1( s ) )
+        {
+            return -1;
+        }
+    }
+    return 0;
+}
+
+/* Weighted prediction table of the slice header: not implemented.
+ * Always returns -1 without reading anything from s, which makes
+ * x264_slice_header_decode fail for every slice that needs the table. */
+static int x264_slice_header_pred_weight_table( x264_t *h, bs_t *s )
+{
+    return -1;
+}
+
+/* Read the reference picture marking flags of a slice header.
+ *
+ * IDR slices carry two one-bit flags; they are consumed but not acted upon
+ * yet. Any other slice carries a single flag selecting adaptive marking,
+ * which is not supported.
+ *
+ * Returns 0 on success, -1 when adaptive marking is requested. */
+static int  x264_slice_header_dec_ref_pic_marking( x264_t *h, bs_t *s, int i_nal_type  )
+{
+    if( i_nal_type != NAL_SLICE_IDR )
+    {
+        /* a set flag means adaptive marking, that we cannot handle */
+        return bs_read1( s ) ? -1 : 0;
+    }
+
+    /* TODO: use them, for now they are read only to stay in sync */
+    bs_read1( s );  /* no_output_of_prior_pics */
+    bs_read1( s );  /* long_term_reference_flag */
+
+    return 0;
+}
+
+/****************************************************************************
+ * Decode a slice header and setup h for mb decoding.
+ *
+ *  - the first header part is parsed into h->sh,
+ *  - when the SPS/PPS referenced by the slice differ from the active ones,
+ *    the macroblock array and the reference frames are released and
+ *    allocated again for the new picture size,
+ *  - the picture order count of the current frame is computed,
+ *  - the remaining header parts are parsed.
+ *
+ * Returns 0 on success, -1 on a parse error or an unsupported feature
+ * (poc type 1, non I/P slices, reordering, weighted prediction, adaptive
+ * reference marking).
+ ****************************************************************************/
+static int x264_slice_header_decode( x264_t *h, bs_t *s, x264_nal_t *nal )
+{
+    /* read the first part of the slice */
+    if( x264_slice_header_part1_read( s, &h->sh,
+                                      h->sps_array, h->pps_array,
+                                      nal->i_type == NAL_SLICE_IDR ? 1 : 0 ) < 0 )
+    {
+        fprintf( stderr, "x264_slice_header_part1_read failed\n" );
+        return -1;
+    }
+
+    /* now reset h if needed for this frame */
+    if( h->sps != h->sh.sps || h->pps != h->sh.pps )
+    {
+        /* TODO */
+
+        /* Release the buffers while h->sps is still valid: the number of
+         * allocated reference frames is taken from it. A picture size of 0
+         * means that nothing has been allocated yet. */
+        if( h->sps != NULL &&
+            h->picture->i_width != 0 && h->picture->i_height != 0 )
+        {
+            int i;
+
+            for( i = 0; i < h->sps->i_num_ref_frames + 1; i++ )
+            {
+                x264_frame_delete( h->freference[i]);
+            }
+            /* same release function as in x264_decoder_close */
+            x264_free( h->mb );
+        }
+
+        h->sps = NULL;
+        h->pps = NULL;
+
+        h->picture->i_width = 0;
+        h->picture->i_height = 0;
+    }
+
+    /* and init if needed */
+    if( h->sps == NULL || h->pps == NULL )
+    {
+        int i;
+
+        h->sps = h->sh.sps;
+        h->pps = h->sh.pps;
+
+        h->param.i_width = h->picture->i_width = 16 * h->sps->i_mb_width;
+        h->param.i_height= h->picture->i_height= 16 * h->sps->i_mb_height;
+
+        fprintf( stderr, "x264: %dx%d\n", h->picture->i_width, h->picture->i_height );
+
+        h->mb = x264_macroblocks_new( h->sps->i_mb_width, h->sps->i_mb_height );
+
+        /* one more frame than the reference count: freference[0] is the
+         * frame being decoded; poc -1 marks a frame as not usable yet */
+        for( i = 0; i < h->sps->i_num_ref_frames + 1; i++ )
+        {
+            h->freference[i] = x264_frame_new( h );
+            h->freference[i]->i_poc = -1;
+        }
+        h->fdec = h->freference[0];
+        h->i_ref0 = 0;
+        h->i_ref1 = 0;
+
+        h->i_poc_msb = 0;
+        h->i_poc_lsb = 0;
+        h->i_frame_offset = 0;
+        h->i_frame_num = 0;
+    }
+
+    /* calculate poc for current frame */
+    if( h->sps->i_poc_type == 0 )
+    {
+        int i_max_poc_lsb = 1 << h->sps->i_log2_max_poc_lsb;
+
+        /* detect a wrap of the lsb part in either direction */
+        if( h->sh.i_poc_lsb < h->i_poc_lsb && h->i_poc_lsb - h->sh.i_poc_lsb >= i_max_poc_lsb/2 )
+        {
+            h->i_poc_msb += i_max_poc_lsb;
+        }
+        else if( h->sh.i_poc_lsb > h->i_poc_lsb  && h->sh.i_poc_lsb - h->i_poc_lsb > i_max_poc_lsb/2 )
+        {
+            h->i_poc_msb -= i_max_poc_lsb;
+        }
+        h->i_poc_lsb = h->sh.i_poc_lsb;
+
+        h->i_poc = h->i_poc_msb + h->sh.i_poc_lsb;
+    }
+    else if( h->sps->i_poc_type == 1 )
+    {
+        /* FIXME */
+        return -1;
+    }
+    else
+    {
+        if( nal->i_type == NAL_SLICE_IDR )
+        {
+            h->i_frame_offset = 0;
+            h->i_poc = 0;
+        }
+        else
+        {
+            /* frame_num went backward: it has wrapped */
+            if( h->sh.i_frame_num < h->i_frame_num )
+            {
+                h->i_frame_offset += 1 << h->sps->i_log2_max_frame_num;
+            }
+            if( nal->i_ref_idc > 0 )
+            {
+                h->i_poc = 2 * ( h->i_frame_offset + h->sh.i_frame_num );
+            }
+            else
+            {
+                h->i_poc = 2 * ( h->i_frame_offset + h->sh.i_frame_num ) - 1;
+            }
+        }
+        h->i_frame_num = h->sh.i_frame_num;
+    }
+
+    fprintf( stderr, "x264: pic type=%s poc:%d\n",
+             h->sh.i_type == SLICE_TYPE_I ? "I" : (h->sh.i_type == SLICE_TYPE_P ? "P" : "B?" ),
+             h->i_poc );
+
+    if( h->sh.i_type != SLICE_TYPE_I && h->sh.i_type != SLICE_TYPE_P )
+    {
+        fprintf( stderr, "only SLICE I/P supported\n" );
+        return -1;
+    }
+
+    /* read and do the ref pic reordering */
+    if( x264_slice_header_ref_pic_reordering( h, s ) < 0 )
+    {
+        return -1;
+    }
+
+    if( ( (h->sh.i_type == SLICE_TYPE_P || h->sh.i_type == SLICE_TYPE_SP) && h->sh.pps->b_weighted_pred  ) ||
+        ( h->sh.i_type == SLICE_TYPE_B && h->sh.pps->b_weighted_bipred ) )
+    {
+        if( x264_slice_header_pred_weight_table( h, s ) < 0 )
+        {
+            return -1;
+        }
+    }
+
+    if( nal->i_ref_idc != 0 )
+    {
+        /* a failure means an unsupported marking mode: stop here rather
+         * than parse the rest of the header from a wrong bit position */
+        if( x264_slice_header_dec_ref_pic_marking( h, s, nal->i_type ) < 0 )
+        {
+            return -1;
+        }
+    }
+
+    if( x264_slice_header_part2_read( s, &h->sh ) < 0 )
+    {
+        return -1;
+    }
+
+    return 0;
+}
+
+/* Decode the macroblocks of a slice, from h->sh.i_first_mb up to the end of
+ * the picture or the first macroblock that cannot be read.
+ *
+ * On success the decoded frame gets its borders expanded, is deblocked
+ * unless disabled by the slice header, is exposed through h->picture and the
+ * reference frame buffer is rotated.
+ *
+ * Returns the status of the last macroblock read (negative on failure). */
+static int x264_slice_data_decode( x264_t *h, bs_t *s )
+{
+    int i_status = 0;
+    int i_mb = h->sh.i_first_mb;
+    int i_plane;
+    int i_slot;
+
+    if( h->pps->b_cabac )
+    {
+        /* TODO: alignment and cabac init */
+    }
+
+    /* FIXME field decoding */
+    while( i_mb < h->sps->i_mb_width * h->sps->i_mb_height )
+    {
+        x264_mb_context_t context;
+        x264_macroblock_t *mb = &h->mb[i_mb];
+
+        /* load neighbour */
+        x264_macroblock_context_load( h, mb, &context );
+
+        if( h->pps->b_cabac )
+        {
+            /* TODO: skip handling for non intra slices */
+            i_status = x264_macroblock_read_cabac( h, s, mb );
+        }
+        else
+        {
+            if( h->sh.i_type != SLICE_TYPE_I && h->sh.i_type != SLICE_TYPE_SI )
+            {
+                /* a run of skipped macroblocks comes before the coded one */
+                int i_run;
+
+                for( i_run = bs_read_ue( s ); i_run > 0; i_run-- )
+                {
+                    x264_macroblock_decode_skip( h, mb );
+
+                    /* next macroblock */
+                    i_mb++;
+                    if( i_mb >= h->sps->i_mb_width * h->sps->i_mb_height )
+                    {
+                        break;
+                    }
+                    mb = &h->mb[i_mb];
+
+                    /* load neighbour */
+                    x264_macroblock_context_load( h, mb, &context );
+                }
+                /* the run reached the end of the picture */
+                if( i_mb >= h->sps->i_mb_width * h->sps->i_mb_height )
+                {
+                    break;
+                }
+            }
+            i_status = x264_macroblock_read_cavlc( h, s, mb );
+        }
+
+        if( i_status < 0 )
+        {
+            fprintf( stderr, "x264_macroblock_read failed [%d,%d]\n", mb->i_mb_x, mb->i_mb_y );
+            break;
+        }
+
+        if( x264_macroblock_decode( h, mb ) < 0 )
+        {
+            fprintf( stderr, "x264_macroblock_decode failed\n" );
+            /* try to do some error correction ;) */
+        }
+
+        i_mb++;
+    }
+
+    if( i_status < 0 )
+    {
+        return i_status;
+    }
+
+    /* expand border for frame reference TODO avoid it when using b-frame
+     * (a disabled variant used to do it after the deblocking instead) */
+    x264_frame_expand_border( h->fdec );
+
+    /* apply deblocking filter to the current decoded picture */
+    if( !h->pps->b_deblocking_filter_control || h->sh.i_disable_deblocking_filter_idc != 1 )
+    {
+        x264_frame_deblocking_filter( h, h->sh.i_type );
+    }
+
+    /* the output picture only borrows the planes of the decoded frame */
+    h->picture->i_plane = h->fdec->i_plane;
+    for( i_plane = 0; i_plane < h->picture->i_plane; i_plane++ )
+    {
+        h->picture->i_stride[i_plane] = h->fdec->i_stride[i_plane];
+        h->picture->plane[i_plane]    = h->fdec->plane[i_plane];
+    }
+
+    /* move frame in the buffer FIXME won't work for B-frame:
+     * the last slot is recycled as the next frame to decode and every other
+     * frame is shifted by one position */
+    h->fdec = h->freference[h->sps->i_num_ref_frames];
+    for( i_slot = h->sps->i_num_ref_frames; i_slot > 0; i_slot-- )
+    {
+        h->freference[i_slot] = h->freference[i_slot-1];
+    }
+    h->freference[0] = h->fdec;
+
+    return i_status;
+}
+
+/****************************************************************************
+ *
+ ******************************* x264 libs **********************************
+ *
+ ****************************************************************************/
+
+/****************************************************************************
+ * x264_decoder_open:
+ *
+ * Allocate a decoder handle, copy param into it and initialise the
+ * prediction/pixel/dct/mc function tables and the vlc lookup tables.
+ * No SPS/PPS is active and no frame is allocated until the first slice
+ * header is decoded.
+ *
+ * Returns the new handle (to release with x264_decoder_close), or NULL when
+ * the handle or its picture cannot be allocated.
+ ****************************************************************************/
+x264_t *x264_decoder_open   ( x264_param_t *param )
+{
+    x264_t *h;
+    int i;
+
+    h = x264_malloc( sizeof( x264_t ) );
+    if( h == NULL )
+    {
+        return NULL;
+    }
+
+    /* allocated first so that a failure has nothing else to undo */
+    h->picture = x264_malloc( sizeof( x264_picture_t ) );
+    if( h->picture == NULL )
+    {
+        x264_free( h );
+        return NULL;
+    }
+    /* a null size means "no frame allocated yet" for the rest of the decoder */
+    h->picture->i_width = 0;
+    h->picture->i_height= 0;
+
+    memcpy( &h->param, param, sizeof( x264_param_t ) );
+
+    h->cpu = param->cpu;
+
+    /* no SPS and PPS active yet */
+    h->sps = NULL;
+    h->pps = NULL;
+
+    for( i = 0; i < 32; i++ )
+    {
+        h->sps_array[i].i_id = -1;  /* invalidate it */
+    }
+    for( i = 0; i < 256; i++ )
+    {
+        h->pps_array[i].i_id = -1;  /* invalidate it */
+    }
+
+    /* init predict_XxX */
+    x264_predict_16x16_init( h->cpu, h->predict_16x16 );
+    x264_predict_8x8_init( h->cpu, h->predict_8x8 );
+    x264_predict_4x4_init( h->cpu, h->predict_4x4 );
+
+    x264_pixel_init( h->cpu, &h->pixf );
+    x264_dct_init( h->cpu, &h->dctf );
+
+    x264_mc_init( h->cpu, h->mc );
+
+    /* create the vlc table (we could remove it from x264_t but it would need
+     * to introduce a x264_init() for the whole library) */
+    for( i = 0; i < 5; i++ )
+    {
+        /* max 2 step */
+        h->x264_coeff_token_lookup[i] = x264_vlc_table_lookup_new( x264_coeff_token[i], 17*4, 4 );
+    }
+    /* max 2 step */
+    h->x264_level_prefix_lookup = x264_vlc_table_lookup_new( x264_level_prefix, 16, 8 );
+
+    for( i = 0; i < 15; i++ )
+    {
+        /* max 1 step */
+        h->x264_total_zeros_lookup[i] = x264_vlc_table_lookup_new( x264_total_zeros[i], 16, 9 );
+    }
+    for( i = 0;i < 3; i++ )
+    {
+        /* max 1 step */
+        h->x264_total_zeros_dc_lookup[i] = x264_vlc_table_lookup_new( x264_total_zeros_dc[i], 4, 3 );
+    }
+    for( i = 0;i < 7; i++ )
+    {
+        /* max 2 step */
+        h->x264_run_before_lookup[i] = x264_vlc_table_lookup_new( x264_run_before[i], 15, 6 );
+    }
+
+    return h;
+}
+
+/****************************************************************************
+ * x264_decoder_decode: decode one nal unit
+ *
+ * *pp_pic is set to the decoded picture when the nal completes a slice,
+ * and to NULL in every other case.
+ * Returns the status of the nal parser used, negative on failure.
+ ****************************************************************************/
+int     x264_decoder_decode( x264_t *h,
+                             x264_picture_t **pp_pic, x264_nal_t *nal )
+{
+    bs_t bs;
+    int i_ret = 0;
+
+    /* no picture */
+    *pp_pic = NULL;
+
+    /* init bitstream reader */
+    bs_init( &bs, nal->p_payload, nal->i_payload );
+
+    switch( nal->i_type )
+    {
+        case NAL_SPS:
+            i_ret = x264_sps_read( &bs, h->sps_array );
+            if( i_ret < 0 )
+            {
+                fprintf( stderr, "x264: x264_sps_read failed\n" );
+            }
+            break;
+
+        case NAL_PPS:
+            i_ret = x264_pps_read( &bs, h->pps_array );
+            if( i_ret < 0 )
+            {
+                fprintf( stderr, "x264: x264_pps_read failed\n" );
+            }
+            break;
+
+        case NAL_SLICE_IDR:
+            fprintf( stderr, "x264: NAL_SLICE_IDR\n" );
+            x264_slice_idr( h );
+            /* fall through: an IDR is then decoded like any other slice */
+
+        case NAL_SLICE:
+            i_ret = x264_slice_header_decode( h, &bs, nal );
+            if( i_ret < 0 )
+            {
+                fprintf( stderr, "x264: x264_slice_header_decode failed\n" );
+            }
+
+            /* the data is decoded only for a valid, non redundant slice */
+            if( i_ret != 0 || h->sh.i_redundant_pic_cnt != 0 )
+            {
+                break;
+            }
+
+            i_ret = x264_slice_data_decode( h, &bs );
+            if( i_ret < 0 )
+            {
+                fprintf( stderr, "x264: x264_slice_data_decode failed\n" );
+            }
+            else
+            {
+                *pp_pic = h->picture;
+            }
+            break;
+
+        case NAL_SLICE_DPA:
+        case NAL_SLICE_DPB:
+        case NAL_SLICE_DPC:
+            fprintf( stderr, "partitioned stream unsupported\n" );
+            i_ret = -1;
+            break;
+
+        case NAL_SEI:
+        default:
+            /* silently ignored */
+            break;
+    }
+
+    /* restore CPU state (before using float again) */
+    x264_cpu_restore( h->cpu );
+
+    return i_ret;
+}
+
+/****************************************************************************
+ * x264_decoder_close:
+ *
+ * Release everything owned by the handle: reference frames and macroblock
+ * array (if any were allocated), vlc lookup tables, output picture and the
+ * handle itself.
+ ****************************************************************************/
+void    x264_decoder_close  ( x264_t *h )
+{
+    int i_idx;
+
+    /* a non null picture size means that frames and macroblocks exist */
+    if( h->picture->i_width != 0 && h->picture->i_height != 0 )
+    {
+        const int i_frames = h->sps->i_num_ref_frames + 1;
+
+        for( i_idx = 0; i_idx < i_frames; i_idx++ )
+            x264_frame_delete( h->freference[i_idx]);
+
+        x264_free( h->mb );
+    }
+
+    /* free vlc table, with the same counts as in x264_decoder_open */
+    for( i_idx = 0; i_idx < 5; i_idx++ )
+        x264_vlc_table_lookup_delete( h->x264_coeff_token_lookup[i_idx] );
+
+    x264_vlc_table_lookup_delete( h->x264_level_prefix_lookup );
+
+    for( i_idx = 0; i_idx < 15; i_idx++ )
+        x264_vlc_table_lookup_delete( h->x264_total_zeros_lookup[i_idx] );
+
+    for( i_idx = 0; i_idx < 3; i_idx++ )
+        x264_vlc_table_lookup_delete( h->x264_total_zeros_dc_lookup[i_idx] );
+
+    for( i_idx = 0; i_idx < 7; i_idx++ )
+        x264_vlc_table_lookup_delete( h->x264_run_before_lookup[i_idx] );
+
+    x264_free( h->picture );
+    x264_free( h );
+}
+
diff --git a/decoder/macroblock.c b/decoder/macroblock.c
new file mode 100644
index 00000000..0d580e45
--- /dev/null
+++ b/decoder/macroblock.c
@@ -0,0 +1,1097 @@
+/*****************************************************************************
+ * macroblock.c: h264 decoder library
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: macroblock.c,v 1.1 2004/06/03 19:27:07 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdint.h>
+
+#include "../core/common.h"
+#include "../core/vlc.h"
+#include "vlc.h"
+#include "macroblock.h"
+
+/* x coordinate (0..3) of each of the 16 blocks, indexed by block number */
+static const uint8_t block_idx_x[16] =
+{
+    0, 1, 0, 1, 2, 3, 2, 3, 0, 1, 0, 1, 2, 3, 2, 3
+};
+/* y coordinate (0..3) of each of the 16 blocks, indexed by block number */
+static const uint8_t block_idx_y[16] =
+{
+    0, 0, 1, 1, 0, 0, 1, 1, 2, 2, 3, 3, 2, 2, 3, 3
+};
+/* inverse of the two tables above: block number for block_idx_xy[x][y] */
+static const uint8_t block_idx_xy[4][4] =
+{
+    { 0, 2, 8,  10},
+    { 1, 3, 9,  11},
+    { 4, 6, 12, 14},
+    { 5, 7, 13, 15}
+};
+
+/* Coded block pattern for each ue(v) code read by x264_macroblock_read_cavlc
+ * (codes 0..47), used for I_4x4 macroblocks.
+ * Bits 0-3 of the result are the luma cbp, the upper bits the chroma cbp. */
+static const int golomb_to_intra4x4_cbp[48]=
+{
+    47, 31, 15,  0, 23, 27, 29, 30,  7, 11, 13, 14, 39, 43, 45, 46,
+    16,  3,  5, 10, 12, 19, 21, 26, 28, 35, 37, 42, 44,  1,  2,  4,
+     8, 17, 18, 20, 24,  6,  9, 22, 25, 32, 33, 34, 36, 40, 38, 41
+};
+/* Same mapping for the macroblock types that are neither I_4x4 nor I_16x16 */
+static const int golomb_to_inter_cbp[48]=
+{
+     0, 16,  1,  2,  4,  8, 32,  3,  5, 10, 12, 15, 47,  7, 11, 13,
+    14,  6,  9, 31, 35, 37, 42, 44, 33, 34, 36, 40, 39, 43, 45, 46,
+    17, 18, 20, 24, 19, 21, 26, 28, 23, 27, 29, 30, 22, 25, 38, 41
+};
+
+/* Chroma qp table with 52 entries, identity up to index 29.
+ * NOTE(review): presumably indexed by the luma qp (0..51); it is not used in
+ * this part of the file - confirm against its users. */
+static const int i_chroma_qp_table[52] =
+{
+     0,  1,  2,  3,  4,  5,  6,  7,  8,  9,
+    10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+    20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+    29, 30, 31, 32, 32, 33, 34, 34, 35, 35,
+    36, 36, 37, 37, 37, 38, 38, 38, 39, 39,
+    39, 39
+};
+
+
+/* Store the reference index i_ref (list i_list) in every cell of mb->partition
+ * covered by partition i_part. The covered area depends on mb->i_partition:
+ * 4x4 cells for D_16x16, 4x2 for D_16x8, 2x4 for D_8x16 and 2x2 otherwise. */
+void x264_mb_partition_ref_set( x264_macroblock_t *mb, int i_list, int i_part, int i_ref )
+{
+    int i_x0, i_y0;
+    int i_width, i_height;
+    int i, j;
+
+    x264_mb_partition_getxy( mb, i_part, 0, &i_x0, &i_y0 );
+
+    /* full width for 16x16 and 16x8, full height for 16x16 and 8x16 */
+    i_width  = ( mb->i_partition == D_16x16 || mb->i_partition == D_16x8 ) ? 4 : 2;
+    i_height = ( mb->i_partition == D_16x16 || mb->i_partition == D_8x16 ) ? 4 : 2;
+
+    for( i = i_x0; i < i_x0 + i_width; i++ )
+    {
+        for( j = i_y0; j < i_y0 + i_height; j++ )
+        {
+            mb->partition[i][j].i_ref[i_list] = i_ref;
+        }
+    }
+}
+
+/* Store the motion vector mv (list i_list) in every cell of mb->partition
+ * covered by sub-partition i_sub of partition i_part; position and size of
+ * the area are given by x264_mb_partition_getxy and x264_mb_partition_size. */
+void x264_mb_partition_mv_set( x264_macroblock_t *mb, int i_list, int i_part, int i_sub, int mv[2] )
+{
+    int i_x0, i_y0;
+    int i_width, i_height;
+    int i, j, c;
+
+    x264_mb_partition_getxy( mb, i_part, i_sub, &i_x0, &i_y0 );
+    x264_mb_partition_size ( mb, i_part, i_sub, &i_width, &i_height );
+
+    for( i = i_x0; i < i_x0 + i_width; i++ )
+    {
+        for( j = i_y0; j < i_y0 + i_height; j++ )
+        {
+            /* both components of the vector */
+            for( c = 0; c < 2; c++ )
+            {
+                mb->partition[i][j].mv[i_list][c] = mv[c];
+            }
+        }
+    }
+}
+
+
+/* CABAC macroblock parsing: not implemented, always returns -1 without
+ * touching h, s or mb. */
+int x264_macroblock_read_cabac( x264_t *h, bs_t *s, x264_macroblock_t *mb )
+{
+    return -1;
+}
+
+/* I_PCM macroblock reading: not implemented, always returns -1, so that
+ * x264_macroblock_read_cavlc fails on every I_PCM macroblock. */
+static int x264_macroblock_decode_ipcm( x264_t *h, bs_t *s, x264_macroblock_t *mb )
+{
+    /* TODO */
+    return -1;
+}
+
+
+/* Special values of the i_idx argument of block_residual_read_cavlc for the
+ * DC blocks. They are negative so that they never match a real index in
+ * mb->block[] (no non zero count is stored for them). */
+#define BLOCK_INDEX_CHROMA_DC   (-1)
+#define BLOCK_INDEX_LUMA_DC     (-2)
+
+/* Read one vlc code from s using a multi-level lookup table.
+ *
+ * The first table->i_lookup_bits bits select an entry. An entry with a
+ * negative i_size is a link to a sub-table: i_value is the base index of the
+ * sub-table and -i_size the number of bits selecting an entry inside it.
+ * An entry with a positive or null i_size is final: i_size bits are consumed
+ * and i_value is returned.
+ *
+ * Returns -1 on an out of range index or after more than 5 links.
+ *
+ * NOTE(review): table->i_lookup_bits bits are skipped for each link followed,
+ * even after a sub-table selected with a different bit count - looks
+ * harmless as long as the tables have at most 2 levels, to be confirmed. */
+static int bs_read_vlc( bs_t *s, x264_vlc_table_t *table )
+{
+    int i_depth;
+    int i_entry = bs_show( s, table->i_lookup_bits );
+
+    if( i_entry >= table->i_lookup )
+    {
+        return( -1 );
+    }
+
+    for( i_depth = 1; table->lookup[i_entry].i_size < 0; i_depth++ )
+    {
+        const int i_base    = table->lookup[i_entry].i_value;
+        const int i_nb_bits = -table->lookup[i_entry].i_size;
+
+        if( i_depth > 5 )
+        {
+            return( -1 );        // FIXME what to do ?
+        }
+        bs_skip( s, table->i_lookup_bits );
+
+        i_entry = bs_show( s, i_nb_bits ) + i_base;
+        if( i_entry >= table->i_lookup )
+        {
+            return( -1 );
+        }
+    }
+    bs_skip( s, table->lookup[i_entry].i_size );
+
+    return( table->lookup[i_entry].i_value );
+}
+
+/* Read one CAVLC coded residual block.
+ *
+ *  h, s    : decoder (for the vlc lookup tables) and bitstream
+ *  mb      : current macroblock, used for the non zero count prediction
+ *  i_idx   : index in mb->block[], or BLOCK_INDEX_CHROMA_DC /
+ *            BLOCK_INDEX_LUMA_DC for the DC blocks
+ *  l       : output coefficients, i_count entries, always fully written
+ *  i_count : size of the block (4, 15 or 16 for the callers of this file)
+ *
+ * For i_idx >= 0, mb->block[i_idx].i_non_zero_count is updated.
+ * Returns 0 on success, -1 on an invalid code or when the coded coefficients
+ * would not fit in the block. */
+static int block_residual_read_cavlc( x264_t *h, bs_t *s, x264_macroblock_t *mb,
+                                      int i_idx, int *l, int i_count )
+{
+    int i;
+    int level[16], run[16];
+    int i_coeff;
+
+    int i_tt;
+    int i_total, i_trailing;
+    int i_suffix_length;
+    int i_zero_left;
+
+    for( i = 0; i < i_count; i++ )
+    {
+        l[i] = 0;
+    }
+
+    /* total/trailing */
+    if( i_idx == BLOCK_INDEX_CHROMA_DC )
+    {
+        i_tt = bs_read_vlc( s, h->x264_coeff_token_lookup[4] );
+    }
+    else
+    {
+        /* x264_mb_predict_non_zero_code return 0 <-> (16+16+1)>>1 = 16 */
+        static const int ct_index[17] = {0,0,1,1,2,2,2,2,3,3,3,3,3,3,3,3,3 };
+        int nC;
+
+        if( i_idx == BLOCK_INDEX_LUMA_DC )
+        {
+            /* the luma DC block is predicted as block 0 */
+            nC = x264_mb_predict_non_zero_code( h, mb, 0 );
+        }
+        else
+        {
+            nC = x264_mb_predict_non_zero_code( h, mb, i_idx );
+        }
+
+        i_tt = bs_read_vlc( s, h->x264_coeff_token_lookup[ct_index[nC]] );
+    }
+    if( i_tt < 0 )
+    {
+        return -1;
+    }
+
+    /* the token packs both counts as 4 * total + trailing ones */
+    i_total = i_tt / 4;
+    i_trailing = i_tt % 4;
+
+    /* A corrupted stream can announce more coefficients than the block (or
+     * the local level/run arrays) can hold: refuse it before writing. */
+    if( i_total > i_count || i_total > 16 )
+    {
+        return -1;
+    }
+
+    if( i_idx >= 0 )
+    {
+        mb->block[i_idx].i_non_zero_count = i_total;
+    }
+
+    if( i_total <= 0 )
+    {
+        return 0;
+    }
+
+    i_suffix_length = i_total > 10 && i_trailing < 3 ? 1 : 0;
+
+    /* trailing ones: only the sign is coded */
+    for( i = 0; i < i_trailing; i++ )
+    {
+        level[i] = 1 - 2 * bs_read1( s );
+    }
+
+    /* other levels: prefix + suffix of adaptive length */
+    for( ; i < i_total; i++ )
+    {
+        int i_prefix;
+        int i_level_code;
+
+        i_prefix = bs_read_vlc( s, h->x264_level_prefix_lookup );
+
+        if( i_prefix == -1 )
+        {
+            return -1;
+        }
+        else if( i_prefix < 14 )
+        {
+            if( i_suffix_length > 0 )
+            {
+                i_level_code = (i_prefix << i_suffix_length) + bs_read( s, i_suffix_length );
+            }
+            else
+            {
+                i_level_code = i_prefix;
+            }
+        }
+        else if( i_prefix == 14 )
+        {
+            if( i_suffix_length > 0 )
+            {
+                i_level_code = (i_prefix << i_suffix_length) + bs_read( s, i_suffix_length );
+            }
+            else
+            {
+                i_level_code = i_prefix + bs_read( s, 4 );
+            }
+        }
+        else /* if( i_prefix == 15 ) */
+        {
+            i_level_code = (i_prefix << i_suffix_length) + bs_read( s, 12 );
+            if( i_suffix_length == 0 )
+            {
+                i_level_code += 15;
+            }
+        }
+        /* with less than 3 trailing ones, the first level cannot be +-1 */
+        if( i == i_trailing && i_trailing < 3 )
+        {
+            i_level_code += 2;
+        }
+        /* Optimise */
+        /* even codes are positive levels, odd codes negative ones */
+        level[i] = i_level_code&0x01 ? -((i_level_code+1)/2) : (i_level_code+2)/2;
+
+        if( i_suffix_length == 0 )
+        {
+            i_suffix_length++;
+        }
+        if( abs( level[i] ) > ( 3 << ( i_suffix_length - 1 ) ) && i_suffix_length < 6 )
+        {
+            i_suffix_length++;
+        }
+    }
+
+    /* total number of zeros before the last coefficient */
+    if( i_total < i_count )
+    {
+        if( i_idx == BLOCK_INDEX_CHROMA_DC )
+        {
+            i_zero_left = bs_read_vlc( s, h->x264_total_zeros_dc_lookup[i_total-1] );
+        }
+        else
+        {
+            i_zero_left = bs_read_vlc( s, h->x264_total_zeros_lookup[i_total-1] );
+        }
+        if( i_zero_left < 0 )
+        {
+            return -1;
+        }
+        /* coefficients + zeros must fit in the block, else the placement
+         * loop at the end would write after the end of l */
+        if( i_zero_left > i_count - i_total )
+        {
+            return -1;
+        }
+    }
+    else
+    {
+        i_zero_left = 0;
+    }
+
+    /* run of zeros before each coefficient, while zeros remain */
+    for( i = 0; i < i_total - 1; i++ )
+    {
+        if( i_zero_left <= 0 )
+        {
+            break;
+        }
+        run[i] = bs_read_vlc( s, h->x264_run_before_lookup[X264_MIN( i_zero_left - 1, 6 )] );
+
+        if( run[i] < 0 )
+        {
+            return -1;
+        }
+        i_zero_left -= run[i];
+    }
+    if( i_zero_left < 0 )
+    {
+        return -1;
+    }
+
+    for( ; i < i_total - 1; i++ )
+    {
+        run[i] = 0;
+    }
+    /* the remaining zeros come before the last coded coefficient */
+    run[i_total-1] = i_zero_left;
+
+    /* levels are coded in reverse order: place them from the last one */
+    i_coeff = -1;
+    for( i = i_total - 1; i >= 0; i-- )
+    {
+        i_coeff += run[i] + 1;
+        l[i_coeff] = level[i];
+    }
+
+    return 0;
+}
+
+/* Set the first i_count entries of l to 0 (nothing is done if i_count <= 0) */
+static inline void array_zero_set( int *l, int i_count )
+{
+    int i_left = i_count;
+
+    /* cleared from the end, the result is the same */
+    while( i_left > 0 )
+    {
+        l[--i_left] = 0;
+    }
+}
+
+/* Read one CAVLC coded macroblock from s into mb.
+ *
+ * In order: macroblock type, prediction data (intra modes, or sub
+ * partitions, reference indexes and motion vectors), coded block pattern,
+ * qp and residual blocks. The residual of every block that is not coded is
+ * set to zero.
+ *
+ * Only the I and P macroblock types are handled; I_PCM is delegated to
+ * x264_macroblock_decode_ipcm.
+ *
+ * Returns 0 on success, -1 on an invalid or unsupported value. */
+int x264_macroblock_read_cavlc( x264_t *h, bs_t *s, x264_macroblock_t *mb )
+{
+    int i_mb_i_offset;
+    int i_mb_p_offset;
+    int b_sub_ref0 = 0;
+    int i_type;
+    int i;
+
+    /* read the mb type */
+    /* types below i_mb_p_offset are B types, types from i_mb_p_offset up to
+     * i_mb_i_offset are P types and the others are intra types */
+    switch( h->sh.i_type )
+    {
+        case SLICE_TYPE_I:
+            i_mb_p_offset = 0;  /* shut up gcc */
+            i_mb_i_offset = 0;
+            break;
+        case SLICE_TYPE_P:
+            i_mb_p_offset = 0;
+            i_mb_i_offset = 5;
+            break;
+        case SLICE_TYPE_B:
+            i_mb_p_offset = 23;
+            i_mb_i_offset = 23 + 5;
+            break;
+        default:
+            fprintf( stderr, "internal error or slice unsupported\n" );
+            return -1;
+    }
+
+    i_type = bs_read_ue( s );
+
+    if( i_type < i_mb_i_offset )
+    {
+        if( i_type < i_mb_p_offset )
+        {
+            fprintf( stderr, "unsupported mb type(B*)\n" );
+            /* TODO for B frame */
+            return -1;
+        }
+        else
+        {
+            i_type -= i_mb_p_offset;
+
+            if( i_type == 0 )
+            {
+                mb->i_type = P_L0;
+                mb->i_partition = D_16x16;
+            }
+            else if( i_type == 1 )
+            {
+                mb->i_type = P_L0;
+                mb->i_partition = D_16x8;
+            }
+            else if( i_type == 2 )
+            {
+                mb->i_type = P_L0;
+                mb->i_partition = D_8x16;
+            }
+            else if( i_type == 3 || i_type == 4 )
+            {
+                mb->i_type = P_8x8;
+                mb->i_partition = D_8x8;
+                /* type 4: no reference index is coded, 0 is used */
+                b_sub_ref0 = i_type == 4 ? 1 : 0;
+            }
+            else
+            {
+                fprintf( stderr, "invalid mb type\n" );
+                return -1;
+            }
+        }
+    }
+    else
+    {
+        i_type -= i_mb_i_offset;
+
+        if( i_type == 0 )
+        {
+            mb->i_type = I_4x4;
+        }
+        else if( i_type < 25 )
+        {
+            mb->i_type = I_16x16;
+
+            /* the type also carries the prediction mode and the cbp */
+            mb->i_intra16x16_pred_mode = (i_type - 1)%4;
+            mb->i_cbp_chroma = ( (i_type-1) / 4 )%3;
+            mb->i_cbp_luma   = ((i_type-1) / 12) ? 0x0f : 0x00;
+        }
+        else if( i_type == 25 )
+        {
+            mb->i_type = I_PCM;
+        }
+        else
+        {
+            fprintf( stderr, "invalid mb type (%d)\n", i_type );
+            return -1;
+        }
+    }
+
+    if( mb->i_type == I_PCM )
+    {
+        return x264_macroblock_decode_ipcm( h, s, mb );
+    }
+
+    if( IS_INTRA( mb->i_type ) )
+    {
+        int i_chroma_mode;
+
+        if( mb->i_type == I_4x4 )
+        {
+            for( i = 0; i < 16; i++ )
+            {
+                int b_coded;
+
+                /* 1: the predicted mode is used as it is */
+                b_coded = bs_read1( s );
+
+                if( b_coded )
+                {
+                    mb->block[i].i_intra4x4_pred_mode = x264_mb_predict_intra4x4_mode( h, mb, i );
+                }
+                else
+                {
+                    int i_predicted_mode = x264_mb_predict_intra4x4_mode( h, mb, i );
+                    int i_mode = bs_read( s, 3 );
+
+                    /* the 3 bits select one of the other modes: the
+                     * predicted one is skipped in the numbering */
+                    if( i_mode >= i_predicted_mode )
+                    {
+                        mb->block[i].i_intra4x4_pred_mode = i_mode + 1;
+                    }
+                    else
+                    {
+                        mb->block[i].i_intra4x4_pred_mode = i_mode;
+                    }
+                }
+            }
+        }
+
+        /* only 4 chroma prediction modes can be coded: anything else comes
+         * from a broken stream and must not reach the prediction code */
+        i_chroma_mode = bs_read_ue( s );
+        if( i_chroma_mode < 0 || i_chroma_mode > 3 )
+        {
+            fprintf( stderr, "invalid chroma pred mode\n" );
+            return -1;
+        }
+        mb->i_chroma_pred_mode = i_chroma_mode;
+    }
+    else if( mb->i_type == P_8x8 || mb->i_type == B_8x8)
+    {
+        /* FIXME won't work for B_8x8 */
+
+        /* sub partition of each 8x8 block */
+        for( i = 0; i < 4; i++ )
+        {
+            int i_sub_partition;
+
+            i_sub_partition = bs_read_ue( s );
+            switch( i_sub_partition )
+            {
+                case 0:
+                    mb->i_sub_partition[i] = D_L0_8x8;
+                    break;
+                case 1:
+                    mb->i_sub_partition[i] = D_L0_8x4;
+                    break;
+                case 2:
+                    mb->i_sub_partition[i] = D_L0_4x8;
+                    break;
+                case 3:
+                    mb->i_sub_partition[i] = D_L0_4x4;
+                    break;
+                default:
+                    fprintf( stderr, "invalid i_sub_partition\n" );
+                    return -1;
+            }
+        }
+        /* reference index of each 8x8 block */
+        for( i = 0; i < 4; i++ )
+        {
+            int i_ref;
+
+            i_ref = b_sub_ref0 ? 0 : bs_read_te( s, h->sh.i_num_ref_idx_l0_active - 1 );
+            x264_mb_partition_ref_set( mb, 0, i, i_ref );
+        }
+        /* motion vectors: a coded difference added to the prediction */
+        for( i = 0; i < 4; i++ )
+        {
+            int i_sub;
+            int i_ref;
+
+            x264_mb_partition_get( mb, 0, i, 0, &i_ref, NULL, NULL );
+
+            for( i_sub = 0; i_sub < x264_mb_partition_count_table[mb->i_sub_partition[i]]; i_sub++ )
+            {
+                int mv[2];
+
+                x264_mb_predict_mv( mb, 0, i, i_sub, mv );
+                mv[0] += bs_read_se( s );
+                mv[1] += bs_read_se( s );
+
+                x264_mb_partition_mv_set( mb, 0, i, i_sub, mv );
+            }
+        }
+    }
+    else if( mb->i_type != B_DIRECT )
+    {
+        /* FIXME will work only for P block */
+
+        /* FIXME using x264_mb_partition_set/x264_mb_partition_get here are too unoptimised
+         * I should introduce ref and mv get/set */
+
+        /* Motion Vector */
+        int i_part = x264_mb_partition_count_table[mb->i_partition];
+
+        /* all the reference indexes come before the first vector */
+        for( i = 0; i < i_part; i++ )
+        {
+            int i_ref;
+
+            i_ref = bs_read_te( s, h->sh.i_num_ref_idx_l0_active - 1 );
+
+            x264_mb_partition_ref_set( mb, 0, i, i_ref );
+        }
+
+        for( i = 0; i < i_part; i++ )
+        {
+            int mv[2];
+
+            x264_mb_predict_mv( mb, 0, i, 0, mv );
+
+            mv[0] += bs_read_se( s );
+            mv[1] += bs_read_se( s );
+
+            x264_mb_partition_mv_set( mb, 0, i, 0, mv );
+        }
+    }
+
+    /* coded block pattern, already known for I_16x16 (see the mb type) */
+    if( mb->i_type != I_16x16 )
+    {
+        int i_cbp;
+
+        i_cbp = bs_read_ue( s );
+        if( i_cbp >= 48 )
+        {
+            fprintf( stderr, "invalid cbp\n" );
+            return -1;
+        }
+
+        if( mb->i_type == I_4x4 )
+        {
+            i_cbp = golomb_to_intra4x4_cbp[i_cbp];
+        }
+        else
+        {
+            i_cbp = golomb_to_inter_cbp[i_cbp];
+        }
+        mb->i_cbp_luma   = i_cbp&0x0f;
+        mb->i_cbp_chroma = i_cbp >> 4;
+    }
+
+    if( mb->i_cbp_luma > 0 || mb->i_cbp_chroma > 0 || mb->i_type == I_16x16 )
+    {
+        /* NOTE(review): the coded delta is added to the slice qp and not to
+         * the qp of the previous macroblock - check it against the spec */
+        mb->i_qp = bs_read_se( s ) + h->pps->i_pic_init_qp + h->sh.i_qp_delta;
+
+        /* read residual */
+        if( mb->i_type == I_16x16 )
+        {
+            /* DC Luma */
+            if( block_residual_read_cavlc( h, s, mb, BLOCK_INDEX_LUMA_DC , mb->luma16x16_dc, 16 ) < 0 )
+            {
+                return -1;
+            }
+
+            if( mb->i_cbp_luma != 0 )
+            {
+                /* AC Luma */
+                for( i = 0; i < 16; i++ )
+                {
+                    if( block_residual_read_cavlc( h, s, mb, i, mb->block[i].residual_ac, 15 ) < 0 )
+                    {
+                        return -1;
+                    }
+                }
+            }
+            else
+            {
+                for( i = 0; i < 16; i++ )
+                {
+                    mb->block[i].i_non_zero_count = 0;
+                    array_zero_set( mb->block[i].residual_ac, 15 );
+                }
+            }
+        }
+        else
+        {
+            for( i = 0; i < 16; i++ )
+            {
+                /* one cbp bit for each group of 4 consecutive blocks */
+                if( mb->i_cbp_luma & ( 1 << ( i / 4 ) ) )
+                {
+                    if( block_residual_read_cavlc( h, s, mb, i, mb->block[i].luma4x4, 16 ) < 0 )
+                    {
+                        return -1;
+                    }
+                }
+                else
+                {
+                    mb->block[i].i_non_zero_count = 0;
+                    array_zero_set( mb->block[i].luma4x4, 16 );
+                }
+            }
+        }
+
+        if( mb->i_cbp_chroma &0x03 )    /* Chroma DC residual present */
+        {
+            if( block_residual_read_cavlc( h, s, mb, BLOCK_INDEX_CHROMA_DC, mb->chroma_dc[0], 4 ) < 0 ||
+                block_residual_read_cavlc( h, s, mb, BLOCK_INDEX_CHROMA_DC, mb->chroma_dc[1], 4 ) < 0 )
+            {
+                return -1;
+            }
+        }
+        else
+        {
+            array_zero_set( mb->chroma_dc[0], 4 );
+            array_zero_set( mb->chroma_dc[1], 4 );
+        }
+        if( mb->i_cbp_chroma&0x02 ) /* Chroma AC residual present */
+        {
+            for( i = 0; i < 8; i++ )
+            {
+                if( block_residual_read_cavlc( h, s, mb, 16 + i, mb->block[16+i].residual_ac, 15 ) < 0 )
+                {
+                    return -1;
+                }
+            }
+        }
+        else
+        {
+            for( i = 0; i < 8; i++ )
+            {
+                mb->block[16+i].i_non_zero_count = 0;
+                array_zero_set( mb->block[16+i].residual_ac, 15 );
+            }
+        }
+    }
+    else
+    {
+        /* nothing is coded: slice qp and an all zero residual */
+        mb->i_qp = h->pps->i_pic_init_qp + h->sh.i_qp_delta;
+        for( i = 0; i < 16; i++ )
+        {
+            mb->block[i].i_non_zero_count = 0;
+            array_zero_set( mb->block[i].luma4x4, 16 );
+        }
+        array_zero_set( mb->chroma_dc[0], 4 );
+        array_zero_set( mb->chroma_dc[1], 4 );
+        for( i = 0; i < 8; i++ )
+        {
+            array_zero_set( mb->block[16+i].residual_ac, 15 );
+            mb->block[16+i].i_non_zero_count = 0;
+        }
+    }
+
+    //fprintf( stderr, "mb read type=%d\n", mb->i_type );
+
+    return 0;
+}
+
+
+
+
+/* Restrict an intra 16x16 prediction mode to the edges flagged in
+ * mb->i_neighbour.
+ *  - left and top both flagged: i_mode is returned unchanged;
+ *  - neither flagged: I_PRED_16x16_DC_128;
+ *  - a single edge: DC becomes the one-sided DC, the directional mode
+ *    reading that edge is kept, any other mode is reported on stderr
+ *    and replaced by the one-sided DC. */
+static int x264_mb_pred_mode16x16_valid( x264_macroblock_t *mb, int i_mode )
+{
+    const int b_left = ( mb->i_neighbour & MB_LEFT ) != 0;
+    const int b_top  = ( mb->i_neighbour & MB_TOP ) != 0;
+    int i_dc;   /* one-sided DC predictor for the usable edge */
+    int i_dir;  /* directional mode that reads only that edge */
+
+    if( ( mb->i_neighbour & (MB_LEFT|MB_TOP) ) == (MB_LEFT|MB_TOP) )
+    {
+        return i_mode;
+    }
+    if( !b_left && !b_top )
+    {
+        return I_PRED_16x16_DC_128;
+    }
+
+    /* the left edge takes precedence, as it is tested first */
+    i_dc  = b_left ? I_PRED_16x16_DC_LEFT : I_PRED_16x16_DC_TOP;
+    i_dir = b_left ? I_PRED_16x16_H       : I_PRED_16x16_V;
+
+    if( i_mode == I_PRED_16x16_DC )
+    {
+        return i_dc;
+    }
+    if( i_mode != i_dir )
+    {
+        fprintf( stderr, "invalid 16x16 prediction\n" );
+        return i_dc;
+    }
+
+    return i_dir;
+}
+
+/* Restrict an intra chroma (8x8) prediction mode to the edges flagged in
+ * mb->i_neighbour.
+ *  - left and top both flagged: i_mode is returned unchanged;
+ *  - neither flagged: I_PRED_CHROMA_DC_128;
+ *  - a single edge: DC becomes the one-sided DC, the directional mode
+ *    reading that edge is kept, any other mode is reported on stderr
+ *    and replaced by the one-sided DC. */
+static int x264_mb_pred_mode8x8_valid( x264_macroblock_t *mb, int i_mode )
+{
+    const int b_left = ( mb->i_neighbour & MB_LEFT ) != 0;
+    const int b_top  = ( mb->i_neighbour & MB_TOP ) != 0;
+    int i_dc;   /* one-sided DC predictor for the usable edge */
+    int i_dir;  /* directional mode that reads only that edge */
+
+    if( ( mb->i_neighbour & (MB_LEFT|MB_TOP) ) == (MB_LEFT|MB_TOP) )
+    {
+        return i_mode;
+    }
+    if( !b_left && !b_top )
+    {
+        return I_PRED_CHROMA_DC_128;
+    }
+
+    /* the left edge takes precedence, as it is tested first */
+    i_dc  = b_left ? I_PRED_CHROMA_DC_LEFT : I_PRED_CHROMA_DC_TOP;
+    i_dir = b_left ? I_PRED_CHROMA_H       : I_PRED_CHROMA_V;
+
+    if( i_mode == I_PRED_CHROMA_DC )
+    {
+        return i_dc;
+    }
+    if( i_mode != i_dir )
+    {
+        fprintf( stderr, "invalid 8x8 prediction\n" );
+        return i_dc;
+    }
+
+    return i_dir;
+}
+
+/* Validate the intra 4x4 prediction mode of luma block idx against the
+ * neighbours flagged in mb->i_neighbour (needmb[idx] lists the ones the
+ * block depends on) and return the mode to run.
+ * DC is narrowed to its LEFT/TOP/128 variant when an edge is missing; any
+ * other mode is returned as is when the edges tested for it are usable,
+ * else it is reported on stderr and replaced by I_PRED_4x4_DC_128.
+ * *pb_emu is set to 1 only when DDL/VL is accepted while the MB_TOPRIGHT /
+ * MB_PRIVATE neighbours it needs are not all flagged, and to 0 otherwise. */
+static int x264_mb_pred_mode4x4_valid( x264_macroblock_t *mb, int idx, int i_mode, int *pb_emu )
+{
+    int b_a, b_b, b_c;
+    static const int needmb[16] =
+    {
+        MB_LEFT|MB_TOP, MB_TOP,
+        MB_LEFT,        MB_PRIVATE,
+        MB_TOP,         MB_TOP|MB_TOPRIGHT,
+        0,              MB_PRIVATE,
+        MB_LEFT,        0,
+        MB_LEFT,        MB_PRIVATE,
+        0,              MB_PRIVATE,
+        0,              MB_PRIVATE
+    };
+    int b_emu = 0;
+
+    *pb_emu = 0;
+
+    b_a = (needmb[idx]&mb->i_neighbour&MB_LEFT) == (needmb[idx]&MB_LEFT);
+    b_b = (needmb[idx]&mb->i_neighbour&MB_TOP) == (needmb[idx]&MB_TOP);
+    b_c = (needmb[idx]&mb->i_neighbour&(MB_TOPRIGHT|MB_PRIVATE)) == (needmb[idx]&(MB_TOPRIGHT|MB_PRIVATE));
+
+    if( b_c == 0 && b_b )
+    {
+        b_emu = 1;
+        b_c = 1;
+    }
+
+    /* handle I_PRED_4x4_DC */
+    if( i_mode == I_PRED_4x4_DC )
+    {
+        if( b_a )
+        {
+            return b_b ? I_PRED_4x4_DC : I_PRED_4x4_DC_LEFT;
+        }
+        return b_b ? I_PRED_4x4_DC_TOP : I_PRED_4x4_DC_128;
+    }
+
+    /* handle 1 dir needed only */
+    if( ( b_a && i_mode == I_PRED_4x4_H ) ||
+        ( b_b && i_mode == I_PRED_4x4_V ) )
+    {
+        return i_mode;
+    }
+
+    /* handle b_c case (b_b always true) */
+    if( b_c && ( i_mode == I_PRED_4x4_DDL || i_mode == I_PRED_4x4_VL ) )
+    {
+        *pb_emu = b_emu;
+        return i_mode;
+    }
+
+    if( b_a && b_b )
+    {
+        /* I_PRED_4x4_DDR, I_PRED_4x4_VR, I_PRED_4x4_HD, I_PRED_4x4_HU */
+        return i_mode;
+    }
+
+    fprintf( stderr, "invalid 4x4 predict mode(%d, mb:%x-%x idx:%d)\n", i_mode, mb->i_mb_x, mb->i_mb_y, idx );
+    return I_PRED_4x4_DC_128;   /* must be a 4x4 mode, not a chroma one (inefficient) */
+}
+
+/****************************************************************************
+ * UnScan functions
+ ****************************************************************************/
+static const int scan_zigzag_x[16]={0, 1, 0, 0, 1, 2, 3, 2, 1, 0, 1, 2, 3, 3, 2, 3};
+static const int scan_zigzag_y[16]={0, 0, 1, 2, 1, 0, 0, 1, 2, 3, 3, 2, 1, 2, 3, 3};
+/* Scatter 16 zigzag-ordered levels to their (y,x) slots of a 4x4 block. */
+static inline void unscan_zigzag_4x4full( int16_t dct[4][4], int level[16] )
+{
+    int i_scan;
+    for( i_scan = 0; i_scan < 16; i_scan++ )
+    {
+        dct[scan_zigzag_y[i_scan]][scan_zigzag_x[i_scan]] = level[i_scan];
+    }
+}
+/* Scatter 15 AC levels to scan positions 1..15; dct[0][0] is not written. */
+static inline void unscan_zigzag_4x4( int16_t dct[4][4], int level[15] )
+{
+    int i_ac;
+    for( i_ac = 0; i_ac < 15; i_ac++ )
+    {
+        dct[scan_zigzag_y[i_ac + 1]][scan_zigzag_x[i_ac + 1]] = level[i_ac];
+    }
+}
+/* Copy the four chroma DC levels into the 2x2 block in raster order. */
+static inline void unscan_zigzag_2x2_dc( int16_t dct[2][2], int level[4] )
+{
+    int i_dc;
+    for( i_dc = 0; i_dc < 4; i_dc++ )
+    {
+        dct[i_dc >> 1][i_dc & 1] = level[i_dc];
+    }
+}
+
+/* Rebuild one parsed macroblock in ctx->p_fdec[] (prediction + residual); always returns 0. */
+int x264_macroblock_decode( x264_t *h, x264_macroblock_t *mb )
+{
+    x264_mb_context_t *ctx = mb->context;
+
+    int i_qscale;   /* luma qp first, replaced by the chroma qp before the chroma pass */
+    int ch;
+    int i;
+
+    if( !IS_INTRA(mb->i_type ) )
+    {
+        /* Motion compensation */
+        x264_mb_mc( h, mb );
+    }
+
+    /* luma */
+    i_qscale = mb->i_qp;
+    if( mb->i_type == I_16x16 )
+    {
+        int     i_mode = x264_mb_pred_mode16x16_valid( mb, mb->i_intra16x16_pred_mode );
+        int16_t luma[16][4][4];         /* residual of the 16 4x4 blocks */
+        int16_t dct4x4[16+1][4][4];     /* [0]: the DC block, [1+i]: coefficients of block i */
+
+
+        /* do the right prediction */
+        h->predict_16x16[i_mode]( ctx->p_fdec[0], ctx->i_fdec[0] );
+
+        /* get dc coeffs */
+        unscan_zigzag_4x4full( dct4x4[0], mb->luma16x16_dc );
+        h->dctf.idct4x4dc( dct4x4[0], dct4x4[0] );
+        x264_mb_dequant_4x4_dc( dct4x4[0], i_qscale );
+
+        /* decode the 16x16 macroblock */
+        for( i = 0; i < 16; i++ )
+        {
+            /* only the 15 AC positions are filled here, [0][0] is set below */
+            unscan_zigzag_4x4( dct4x4[1+i], mb->block[i].residual_ac );
+            x264_mb_dequant_4x4( dct4x4[1+i], i_qscale );
+
+            /* copy dc coeff */
+            dct4x4[1+i][0][0] = dct4x4[0][block_idx_y[i]][block_idx_x[i]];
+
+            h->dctf.idct4x4( luma[i], dct4x4[i+1] );
+        }
+        /* put pixels to fdec */
+        h->pixf.add16x16( ctx->p_fdec[0], ctx->i_fdec[0], luma );
+    }
+    else if( mb->i_type == I_4x4 )
+    {
+        /* each block is predicted, then its residual is added, before the next block is handled */
+        for( i = 0; i < 16; i++ )
+        {
+            int16_t luma[4][4];
+            int16_t dct4x4[4][4];
+
+            uint8_t *p_dst_by;      /* top-left pixel of block i in the luma plane */
+            int     i_mode;
+            int     b_emu;
+
+            /* Do the right prediction */
+            p_dst_by = ctx->p_fdec[0] + 4 * block_idx_x[i] + 4 * block_idx_y[i] * ctx->i_fdec[0];
+            i_mode   = x264_mb_pred_mode4x4_valid( mb, i, mb->block[i].i_intra4x4_pred_mode, &b_emu );
+            if( b_emu )
+            {
+                /* NOTE(review): replicates p_dst_by[3] into p_dst_by[4..7], presumably to stand in for missing top-right samples - confirm the row that is meant */
+                fprintf( stderr, "mmmh b_emu\n" );
+                memset( &p_dst_by[4], p_dst_by[3], 4 );
+            }
+            h->predict_4x4[i_mode]( p_dst_by, ctx->i_fdec[0] );
+
+            if( mb->block[i].i_non_zero_count > 0 )     /* else the prediction is kept as is */
+            {
+                /* decode one 4x4 block */
+                unscan_zigzag_4x4full( dct4x4, mb->block[i].luma4x4 );
+
+                x264_mb_dequant_4x4( dct4x4, i_qscale );
+
+                h->dctf.idct4x4( luma, dct4x4 );
+
+                h->pixf.add4x4( p_dst_by, ctx->i_fdec[0], luma );
+            }
+        }
+    }
+    else /* Inter mb */
+    {
+        /* the residual is added on top of what x264_mb_mc() produced above */
+        for( i = 0; i < 16; i++ )
+        {
+            uint8_t *p_dst_by;
+            int16_t luma[4][4];
+            int16_t dct4x4[4][4];
+
+            if( mb->block[i].i_non_zero_count > 0 )
+            {
+                unscan_zigzag_4x4full( dct4x4, mb->block[i].luma4x4 );
+                x264_mb_dequant_4x4( dct4x4, i_qscale );
+
+                h->dctf.idct4x4( luma, dct4x4 );
+
+                p_dst_by = ctx->p_fdec[0] + 4 * block_idx_x[i] + 4 * block_idx_y[i] * ctx->i_fdec[0];
+                h->pixf.add4x4( p_dst_by, ctx->i_fdec[0], luma );
+            }
+        }
+    }
+
+    /* chroma: the qp is the luma qp plus the pps offset, clipped to 0..51 and mapped through i_chroma_qp_table */
+    i_qscale = i_chroma_qp_table[x264_clip3( i_qscale + h->pps->i_chroma_qp_index_offset, 0, 51 )];
+    if( IS_INTRA( mb->i_type ) )
+    {
+        int i_mode = x264_mb_pred_mode8x8_valid( mb, mb->i_chroma_pred_mode );
+        /* do the right prediction */
+        h->predict_8x8[i_mode]( ctx->p_fdec[1], ctx->i_fdec[1] );
+        h->predict_8x8[i_mode]( ctx->p_fdec[2], ctx->i_fdec[2] );
+    }
+
+    if( mb->i_cbp_chroma != 0 )
+    {
+        for( ch = 0; ch < 2; ch++ )     /* plane 1+ch of p_fdec */
+        {
+            int16_t chroma[4][4][4];
+            int16_t dct2x2[2][2];
+            int16_t dct4x4[4][4][4];
+
+            /* get dc chroma */
+            unscan_zigzag_2x2_dc( dct2x2, mb->chroma_dc[ch] );
+            h->dctf.idct2x2dc( dct2x2, dct2x2 );
+            x264_mb_dequant_2x2_dc( dct2x2, i_qscale );
+
+            for( i = 0; i < 4; i++ )
+            {
+                /* blocks 16..19 belong to ch 0, 20..23 to ch 1 */
+                unscan_zigzag_4x4( dct4x4[i], mb->block[16+i+ch*4].residual_ac );
+                x264_mb_dequant_4x4( dct4x4[i], i_qscale );
+
+                /* copy dc coeff */
+                dct4x4[i][0][0] = dct2x2[block_idx_y[i]][block_idx_x[i]];
+
+                h->dctf.idct4x4( chroma[i], dct4x4[i] );
+            }
+            h->pixf.add8x8( ctx->p_fdec[1+ch], ctx->i_fdec[1+ch], chroma );
+        }
+    }
+
+    return 0;
+}
+
+/* Reconstruct a skipped P macroblock: it is decoded as a 16x16 P_L0
+ * partition on reference 0 with the predicted skip motion vector and no
+ * residual, then tagged P_SKIP. */
+void x264_macroblock_decode_skip( x264_t *h, x264_macroblock_t *mb )
+{
+    int i_blk;
+    int i_cell;
+    int mv[2];
+
+    /* no coded coefficient anywhere: 16 luma then 8 chroma blocks */
+    mb->i_cbp_luma = 0;
+    mb->i_cbp_chroma = 0;
+    for( i_blk = 0; i_blk < 16 + 8; i_blk++ )
+    {
+        mb->block[i_blk].i_non_zero_count = 0;
+    }
+    for( i_blk = 0; i_blk < 16; i_blk++ )
+    {
+        array_zero_set( mb->block[i_blk].luma4x4, 16 );
+    }
+    array_zero_set( mb->chroma_dc[0], 4 );
+    array_zero_set( mb->chroma_dc[1], 4 );
+    for( i_blk = 16; i_blk < 16 + 8; i_blk++ )
+    {
+        array_zero_set( mb->block[i_blk].residual_ac, 15 );
+    }
+
+    /* decode it as a 16x16 with every partition cell on ref 0 */
+    mb->i_type = P_L0;
+    mb->i_partition = D_16x16;
+    for( i_cell = 0; i_cell < 4 * 4; i_cell++ )
+    {
+        mb->partition[i_cell / 4][i_cell % 4].i_ref[0] = 0;
+    }
+
+    /* skip motion vector, applied to the whole macroblock */
+    x264_mb_predict_mv_pskip( mb, mv );
+    x264_mb_partition_mv_set( mb, 0, 0, 0, mv );
+
+    /* Motion compensation */
+    x264_mb_mc( h, mb );
+
+    mb->i_type = P_SKIP;
+}
+
diff --git a/decoder/macroblock.h b/decoder/macroblock.h
new file mode 100644
index 00000000..96b5d2eb
--- /dev/null
+++ b/decoder/macroblock.h
@@ -0,0 +1,34 @@
+/*****************************************************************************
+ * macroblock.h: h264 decoder library
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: macroblock.h,v 1.1 2004/06/03 19:27:07 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#ifndef _DECODER_MACROBLOCK_H
+#define _DECODER_MACROBLOCK_H 1
+/* Macroblock readers: fill mb from the bitstream s. The CAVLC reader returns 0 on success and -1 at least when a residual block cannot be read. */
+int  x264_macroblock_read_cabac( x264_t *h, bs_t *s, x264_macroblock_t *mb );  /* NOTE(review): presumably the same return convention - confirm */
+int  x264_macroblock_read_cavlc( x264_t *h, bs_t *s, x264_macroblock_t *mb );
+/* Reconstruction of a read macroblock; the skip variant sets up and motion-compensates a P_SKIP macroblock. */
+int  x264_macroblock_decode( x264_t *h, x264_macroblock_t *mb );   /* always returns 0 */
+void x264_macroblock_decode_skip( x264_t *h, x264_macroblock_t *mb );
+
+#endif
+
diff --git a/decoder/set.c b/decoder/set.c
new file mode 100644
index 00000000..fb489970
--- /dev/null
+++ b/decoder/set.c
@@ -0,0 +1,262 @@
+/*****************************************************************************
+ * x264: h264 decoder
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: set.c,v 1.1 2004/06/03 19:27:07 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdint.h>
+
+#include "../core/common.h"
+#include "set.h"
+
+/* Parse a sequence parameter set from s into sps_array[id].
+ * Returns the sps id, or -1 if the SPS is invalid: nothing is stored when
+ * the id cannot be used, otherwise the selected entry is invalidated
+ * (i_id = -1).
+ * FIXME: the VUI parameters are not parsed. */
+int x264_sps_read( bs_t *s, x264_sps_t sps_array[32] )
+{
+    x264_sps_t *sps;
+
+    int i_profile_idc;
+    int i_level_idc;
+
+    int b_constraint_set0;
+    int b_constraint_set1;
+    int b_constraint_set2;
+
+    int id;
+
+    i_profile_idc     = bs_read( s, 8 );
+    b_constraint_set0 = bs_read( s, 1 );
+    b_constraint_set1 = bs_read( s, 1 );
+    b_constraint_set2 = bs_read( s, 1 );
+
+    bs_skip( s, 5 );    /* reserved */
+    i_level_idc       = bs_read( s, 8 );
+
+    /* id indexes sps_array[32]: a negative value must be rejected too */
+    id = bs_read_ue( s );
+    if( bs_eof( s ) || id < 0 || id >= 32 )
+    {
+        /* the sps is invalid, no need to corrupt sps_array[0] */
+        return -1;
+    }
+
+    sps = &sps_array[id];
+    sps->i_id = id;
+
+    /* put pack parsed value */
+    sps->i_profile_idc     = i_profile_idc;
+    sps->i_level_idc       = i_level_idc;
+    sps->b_constraint_set0 = b_constraint_set0;
+    sps->b_constraint_set1 = b_constraint_set1;
+    sps->b_constraint_set2 = b_constraint_set2;
+
+    sps->i_log2_max_frame_num = bs_read_ue( s ) + 4;
+
+    sps->i_poc_type = bs_read_ue( s );
+    if( sps->i_poc_type == 0 )
+    {
+        sps->i_log2_max_poc_lsb = bs_read_ue( s ) + 4;
+    }
+    else if( sps->i_poc_type == 1 )
+    {
+        int i;
+        sps->b_delta_pic_order_always_zero = bs_read( s, 1 );
+        sps->i_offset_for_non_ref_pic = bs_read_se( s );
+        sps->i_offset_for_top_to_bottom_field = bs_read_se( s );
+        sps->i_num_ref_frames_in_poc_cycle = bs_read_ue( s );
+        if( sps->i_num_ref_frames_in_poc_cycle < 0 ||
+            sps->i_num_ref_frames_in_poc_cycle > 255 )
+        {
+            goto error;     /* H.264 limits the cycle to 0..255 entries */
+        }
+        for( i = 0; i < sps->i_num_ref_frames_in_poc_cycle; i++ )
+        {
+            sps->i_offset_for_ref_frame[i] = bs_read_se( s );
+        }
+    }
+    else if( sps->i_poc_type > 2 )
+    {
+        goto error;
+    }
+
+    sps->i_num_ref_frames = bs_read_ue( s );
+    sps->b_gaps_in_frame_num_value_allowed = bs_read( s, 1 );
+    sps->i_mb_width = bs_read_ue( s ) + 1;
+    sps->i_mb_height= bs_read_ue( s ) + 1;
+    sps->b_frame_mbs_only = bs_read( s, 1 );
+    if( !sps->b_frame_mbs_only )
+    {
+        sps->b_mb_adaptive_frame_field = bs_read( s, 1 );
+    }
+    else
+    {
+        sps->b_mb_adaptive_frame_field = 0;
+    }
+    sps->b_direct8x8_inference = bs_read( s, 1 );
+
+    sps->b_crop = bs_read( s, 1 );
+    if( sps->b_crop )
+    {
+        sps->crop.i_left  = bs_read_ue( s );
+        sps->crop.i_right = bs_read_ue( s );
+        sps->crop.i_top   = bs_read_ue( s );
+        sps->crop.i_bottom= bs_read_ue( s );
+    }
+    else
+    {
+        sps->crop.i_left  = 0;
+        sps->crop.i_right = 0;
+        sps->crop.i_top   = 0;
+        sps->crop.i_bottom= 0;
+    }
+
+    sps->b_vui = bs_read( s, 1 );
+    if( sps->b_vui )
+    {
+        /* FIXME the VUI parameters are not parsed */
+    }
+
+    if( bs_eof( s ) )
+    {
+        /* no rbsp trailing */
+        fprintf( stderr, "incomplete SPS\n" );
+        goto error;
+    }
+
+    fprintf( stderr, "x264_sps_read: sps:0x%x profile:%d/%d poc:%d ref:%d %dx%d crop:%d-%d-%d-%d\n",
+             sps->i_id,
+             sps->i_profile_idc, sps->i_level_idc,
+             sps->i_poc_type,
+             sps->i_num_ref_frames,
+             sps->i_mb_width, sps->i_mb_height,
+             sps->crop.i_left, sps->crop.i_right,
+             sps->crop.i_top, sps->crop.i_bottom );
+
+    return id;
+
+error:
+    /* invalidate this sps */
+    sps->i_id = -1;
+    return -1;
+}
+
+/* Parse a picture parameter set from s into pps_array[id].
+ * Returns the pps id, or -1 when the PPS is invalid (an entry that was
+ * already selected is then invalidated with i_id = -1). */
+int x264_pps_read( bs_t *s, x264_pps_t pps_array[256] )
+{
+    x264_pps_t *pps;
+    int id;
+    int i;
+
+    id = bs_read_ue( s );
+    if( bs_eof( s ) || id < 0 || id >= 256 )
+    {
+        fprintf( stderr, "id invalid\n" );
+        return -1;
+    }
+    pps = &pps_array[id];
+    pps->i_id = id;
+    pps->i_sps_id = bs_read_ue( s );
+    if( pps->i_sps_id < 0 || pps->i_sps_id >= 32 )
+    {
+        goto error;
+    }
+    pps->b_cabac = bs_read( s, 1 );
+    pps->b_pic_order = bs_read( s, 1 );
+    pps->i_num_slice_groups = bs_read_ue( s ) + 1;
+    if( pps->i_num_slice_groups > 1 )
+    {
+        fprintf( stderr, "FMO unsupported\n " );
+        if( pps->i_num_slice_groups > 8 )
+        {
+            goto error;     /* H.264 allows 8 slice groups at most; also bounds the loops below */
+        }
+        pps->i_slice_group_map_type  =bs_read_ue( s );
+        if( pps->i_slice_group_map_type == 0 )
+        {
+            for( i = 0; i < pps->i_num_slice_groups; i++ )
+            {
+                pps->i_run_length[i] = bs_read_ue( s );
+            }
+        }
+        else if( pps->i_slice_group_map_type == 2 )
+        {
+            /* only num_slice_groups - 1 rectangles are coded */
+            for( i = 0; i < pps->i_num_slice_groups - 1; i++ )
+            {
+                pps->i_top_left[i] = bs_read_ue( s );
+                pps->i_bottom_right[i] = bs_read_ue( s );
+            }
+        }
+        else if( pps->i_slice_group_map_type >= 3 && pps->i_slice_group_map_type <= 5 )
+        {
+            pps->b_slice_group_change_direction = bs_read( s, 1 );
+            pps->i_slice_group_change_rate = bs_read_ue( s ) + 1;
+        }
+        else if( pps->i_slice_group_map_type == 6 )
+        {
+            pps->i_pic_size_in_map_units = bs_read_ue( s ) + 1;
+            for( i = 0; i < pps->i_pic_size_in_map_units; i++ )
+            {
+               /*  FIXME slice_group_id[i] is not read: the fields below are then parsed */
+                /* from the wrong position (it is Ceil( Log2( i_num_slice_groups ) ) bits) */
+            }
+        }
+    }
+    pps->i_num_ref_idx_l0_active = bs_read_ue( s ) + 1;
+    pps->i_num_ref_idx_l1_active = bs_read_ue( s ) + 1;
+    pps->b_weighted_pred = bs_read( s, 1 );
+    pps->b_weighted_bipred = bs_read( s, 2 );   /* a 2-bit idc, despite the b_ prefix */
+
+    pps->i_pic_init_qp = bs_read_se( s ) + 26;
+    pps->i_pic_init_qs = bs_read_se( s ) + 26;
+
+    pps->i_chroma_qp_index_offset = bs_read_se( s );
+
+    pps->b_deblocking_filter_control = bs_read( s, 1 );
+    pps->b_constrained_intra_pred = bs_read( s, 1 );
+    pps->b_redundant_pic_cnt = bs_read( s, 1 );
+
+    if( bs_eof( s ) )
+    {
+        /* no rbsp trailing */
+        fprintf( stderr, "incomplete PPS\n" );
+        goto error;
+    }
+    fprintf( stderr, "x264_pps_read: pps:0x%x sps:0x%x %s slice_groups=%d ref0:%d ref1:%d QP:%d QS:%d QC=%d DFC:%d CIP:%d RPC:%d\n",
+             pps->i_id, pps->i_sps_id, pps->b_cabac ? "CABAC" : "CAVLC",
+             pps->i_num_slice_groups,
+             pps->i_num_ref_idx_l0_active, pps->i_num_ref_idx_l1_active,
+             pps->i_pic_init_qp, pps->i_pic_init_qs, pps->i_chroma_qp_index_offset,
+             pps->b_deblocking_filter_control, pps->b_constrained_intra_pred, pps->b_redundant_pic_cnt );
+
+    return id;
+error:
+    pps->i_id = -1;
+    return -1;
+}
+
diff --git a/decoder/set.h b/decoder/set.h
new file mode 100644
index 00000000..62719457
--- /dev/null
+++ b/decoder/set.h
@@ -0,0 +1,33 @@
+/*****************************************************************************
+ * set.h: h264 decoder
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: set.h,v 1.1 2004/06/03 19:27:07 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#ifndef _DECODER_SET_H
+#define _DECODER_SET_H 1
+
+/* return -1 if invalid, else the id */
+int x264_sps_read( bs_t *s, x264_sps_t sps_array[32] );
+
+/* return -1 if invalid, else the id */
+int x264_pps_read( bs_t *s, x264_pps_t pps_array[256] );
+
+#endif
diff --git a/decoder/vlc.c b/decoder/vlc.c
new file mode 100644
index 00000000..2109e8c6
--- /dev/null
+++ b/decoder/vlc.c
@@ -0,0 +1,236 @@
+/*****************************************************************************
+ * vlc.c: VLC lookup table generation.
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: vlc.c,v 1.1 2004/06/03 19:27:07 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <inttypes.h>
+
+#include "../core/common.h"
+#include "../core/vlc.h"
+#include "vlc.h"
+
+
+/* Grow table->lookup by i_size entries and return the index of the first
+ * new entry.  Pointers into the array must be re-derived afterwards. */
+static int  vlc_table_realloc( x264_vlc_table_t *table, int i_size )
+{
+    const int i_first_new = table->i_lookup;
+    const int i_total     = i_first_new + i_size;
+
+    table->lookup   = x264_realloc( table->lookup, sizeof( vlc_lookup_t ) * i_total );
+    table->i_lookup = i_total;
+    return i_first_new;
+}
+
+/* Append to table a lookup level of 1 << i_lookup_bits entries covering the
+ * codes of vlc[] that start with the i_prefix_length bits of i_prefix_code,
+ * and return the index of its first entry.  An entry holds either a vlc[]
+ * index (i_size > 0: bits used at this level), nothing (i_value -1, i_size
+ * 0) or a deeper level (i_size < 0: minus its bit width, i_value: its index). */
+static int vlc_table_create_part( x264_vlc_table_t *table, const vlc_t *vlc, int i_lookup_bits, int i_nb_vlc, int i_prefix_code, int i_prefix_length )
+{
+    int i;
+    int i_nb_lookup;
+    vlc_lookup_t *lookup;
+    int i_table_index;
+
+    i_nb_lookup = 1 << i_lookup_bits;
+    i_table_index = vlc_table_realloc( table, i_nb_lookup );
+    lookup = &table->lookup[i_table_index];
+    for( i = 0; i < i_nb_lookup; i++ )
+    {
+        lookup[i].i_value  = -1;
+        lookup[i].i_size = 0;
+    }
+
+    for( i = 0; i < i_nb_vlc; i++ )
+    {
+        int i_bits;     /* bits of the code left after the prefix */
+        if( vlc[i].i_size <= 0 )
+        {
+            continue;
+        }
+        i_bits = vlc[i].i_size - i_prefix_length;
+        if( i_bits > 0 && ( vlc[i].i_bits >> i_bits ) == i_prefix_code )
+        {
+            if( i_bits <= i_lookup_bits )
+            {
+                /* the code ends at this level: fill every slot it prefixes */
+                const int i_nb_fill = 1 << ( i_lookup_bits - i_bits );
+                int i_lookup_index;
+                int nb;
+                i_lookup_index = ( vlc[i].i_bits << ( i_lookup_bits - i_bits ) )%i_nb_lookup;
+                for( nb = 0; nb < i_nb_fill; nb++ )
+                {
+                    lookup[i_lookup_index].i_value = i; /* vlc[i].i_value; */
+                    lookup[i_lookup_index].i_size = i_bits;
+                    i_lookup_index++;
+                }
+            }
+            else
+            {
+                int i_bits_max;
+                int i_lookup_index;
+                /* need another table: remember the longest tail seen for this slot */
+                i_lookup_index = ( vlc[i].i_bits >> (i_bits - i_lookup_bits ) )%i_nb_lookup;
+                i_bits_max =  -lookup[i_lookup_index].i_size;
+                if( i_bits_max < i_bits - i_lookup_bits )
+                {
+                    i_bits_max = i_bits - i_lookup_bits;
+                }
+                lookup[i_lookup_index].i_size = -i_bits_max;
+            }
+        }
+    }
+
+    /* create other level table */
+    for( i = 0; i < i_nb_lookup; i++ )
+    {
+        if( lookup[i].i_size < 0 )
+        {
+            int i_bits;
+            int i_index;
+            i_bits = -lookup[i].i_size;
+            if( i_bits > i_lookup_bits )
+            {
+                /* a deeper level is never wider than this one */
+                lookup[i].i_size = -i_lookup_bits;
+                i_bits = i_lookup_bits;
+            }
+            i_index = vlc_table_create_part( table, vlc, i_bits, i_nb_vlc,
+                                             (i_prefix_code << i_lookup_bits)|i,
+                                              i_lookup_bits+i_prefix_length );
+            lookup = &table->lookup[i_table_index]; /* the recursion reallocated table->lookup */
+            lookup[i].i_value = i_index;
+        }
+    }
+
+    return( i_table_index );
+}
+
+/* Allocate a lookup table for the i_vlc codes of vlc[], its first level being indexed by i_lookup_bits bits; NULL if the table cannot be allocated. */
+x264_vlc_table_t *x264_vlc_table_lookup_new( const vlc_t *vlc, int i_vlc, int i_lookup_bits )
+{
+    x264_vlc_table_t *table = x264_malloc( sizeof( x264_vlc_table_t ) );
+    if( table != NULL )
+    {
+        table->i_lookup_bits = i_lookup_bits;
+        table->i_lookup = 0;
+        table->lookup   = NULL;
+        vlc_table_create_part( table, vlc, i_lookup_bits, i_vlc, 0, 0 );
+    }
+    return table;
+}
+/* Release a lookup table: its entry array first, then the table itself. */
+void x264_vlc_table_lookup_delete( x264_vlc_table_t *table )
+{
+    vlc_lookup_t *p_entries = table->lookup;
+    x264_free( p_entries );
+    x264_free( table );
+}
+#if 0
+/* Dump every lookup entry to stderr as a MKVLCLU() initialiser, four per line. */
+void x264_vlc_table_lookup_print( x264_vlc_table_t *table )
+{
+    int idx;
+    fprintf( stderr, "       " );
+    for( idx = 0; idx < table->i_lookup; idx++ )
+    {
+        if( table->lookup[idx].i_value != -1 )
+        {
+            fprintf( stderr, " MKVLCLU( 0x%.3x, % 2d ),", table->lookup[idx].i_value, table->lookup[idx].i_size );
+        }
+        else
+        {
+            fprintf( stderr, " MKVLCLU(    -1,  0 )," );
+        }
+        if( idx % 4 == 3 && idx + 1 < table->i_lookup )
+        {
+            fprintf( stderr, "\n       " );
+        }
+    }
+    fprintf( stderr, "\n" );
+}
+
+int main(void)
+{
+    int i;
+    x264_vlc_table_t *table;
+    /* Generator for the static lookup tables: the vlc_lookup_t typedef goes to
+     * stdout (it needs its opening brace to be valid C), the tables to stderr. */
+    printf( "typedef struct\n{\n    int i_value;\n    int i_size;\n} vlc_lookup_t;\n\n#define MKVLCLU(a,b) { .i_value=a, .i_size=b}\n" );
+
+    /* create vlc  entry table and then vlc_lookup_t table */
+
+    /* x264_coeff_token */
+    fprintf( stderr, "static const vlc_lookup_t x264_coeff_token_lookup[5][]=\n{\n" );
+    for( i = 0; i < 5; i++ )
+    {
+        fprintf( stderr, "    {\n" );
+        table = x264_vlc_table_lookup_new( x264_coeff_token[i], 17*4, 6 );
+        x264_vlc_table_lookup_print( table );
+        x264_vlc_table_lookup_delete( table );
+        fprintf( stderr, "    },\n" );
+    }
+    fprintf( stderr, "};\n" );
+
+#if 0
+
+    vlce = convert_vlc_to_vlce( x264_level_prefix, 16 );
+    do_vlc_table_create( vlce, 16, "x264_level_prefix_lookup", 8 );
+    free( vlce );
+
+    for( i_table = 0; i_table < 15; i_table++ )
+    {
+        char name[512];
+        vlce = convert_vlc_to_vlce( x264_total_zeros[i_table], 16 );
+        sprintf( name, "x264_total_zeros_%d", i_table );
+        do_vlc_table_create( vlce, 16, name, 6 );
+
+        free( vlce );
+    }
+
+    for( i_table = 0; i_table < 3; i_table++ )
+    {
+        char name[512];
+
+        vlce = convert_vlc_to_vlce( x264_total_zeros_dc[i_table], 4 );
+        sprintf( name, "x264_total_zeros_dc_%d", i_table );
+        do_vlc_table_create( vlce, 4, name, 3 );
+
+        free( vlce );
+    }
+
+    for( i_table = 0; i_table < 7; i_table++ )
+    {
+        char name[512];
+        vlce = convert_vlc_to_vlce( x264_run_before[i_table], 15 );
+        sprintf( name, "x264_run_before_%d", i_table );
+        do_vlc_table_create( vlce, 15, name, 6 );
+
+        free( vlce );
+    }
+#endif
+    return 0;
+}
+
+#endif
diff --git a/decoder/vlc.h b/decoder/vlc.h
new file mode 100644
index 00000000..9529349e
--- /dev/null
+++ b/decoder/vlc.h
@@ -0,0 +1,46 @@
+/*****************************************************************************
+ * vlc.h: h264 decoder
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: vlc.h,v 1.1 2004/06/03 19:27:07 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#ifndef _DECODER_VLC_H
+#define _DECODER_VLC_H 1
+
+typedef struct
+{
+    int i_value;    /* -1: no code; else an index in the vlc array, or the index of a deeper level when i_size < 0 */
+    int i_size;     /* > 0: bits used by the code at this level; 0: no code; < 0: minus the bit width of the deeper level */
+} vlc_lookup_t;
+
+struct x264_vlc_table_t
+{
+    int          i_lookup_bits;     /* bit width of the first level, as given to x264_vlc_table_lookup_new() */
+
+    int          i_lookup;          /* number of entries allocated in lookup, all levels together */
+    vlc_lookup_t *lookup;           /* the levels stored back to back; the first level starts at index 0 */
+};
+/* Allocate a table and fill it from the i_vlc codes of vlc, entries with i_size <= 0 being ignored. */
+x264_vlc_table_t *x264_vlc_table_lookup_new( const vlc_t *vlc, int i_vlc, int i_lookup_bits );
+/* Free the entries and the table itself. */
+void x264_vlc_table_lookup_delete( x264_vlc_table_t *table );
+
+#endif
+
diff --git a/doc/dct.txt b/doc/dct.txt
new file mode 100644
index 00000000..eb0e64f0
--- /dev/null
+++ b/doc/dct.txt
@@ -0,0 +1,111 @@
+/****************************************************************************
+ * DCT/IDCT functions
+ ****************************************************************************/
+/* Be careful: "dct" may point to the same array as "luma", e.g. dct_4x4( dct, dct ). */
+static void dct_2x2_dc( int16_t dct[2][2], int16_t chroma[2][2] ) /* 2x2 sum/difference transform of chroma[] into dct[], no scaling */
+{
+    int tmp[2][2];  /* chroma[] is fully read into tmp[] before dct[] is written */
+
+    tmp[0][0] = chroma[0][0] + chroma[0][1];
+    tmp[1][0] = chroma[0][0] - chroma[0][1];
+    tmp[0][1] = chroma[1][0] + chroma[1][1];
+    tmp[1][1] = chroma[1][0] - chroma[1][1];
+    /* tmp[0][r] / tmp[1][r] now hold the sum / difference of chroma row r; combine the two rows */
+    dct[0][0] = tmp[0][0] + tmp[0][1];
+    dct[0][1] = tmp[1][0] + tmp[1][1];
+    dct[1][0] = tmp[0][0] - tmp[0][1];
+    dct[1][1] = tmp[1][0] - tmp[1][1];
+}
+/* Inverse 2x2 DC transform: runs dct_2x2_dc with the arguments swapped, i.e. reads dct[] and writes chroma[]. */
+static void idct_2x2_dc( int16_t dct[2][2], int16_t chroma[2][2] )
+{
+    dct_2x2_dc( chroma, dct );
+}
+/* 4x4 transform of the luma DC block using only +1/-1 weights; every output is ( sum + 1 ) / 2. */
+static void dct_4x4_dc( int16_t dct[4][4], int16_t luma[4][4] )
+{
+    int16_t tmp[4][4];
+    int i;
+    /* pass 1: four +/- combinations of luma row i, stored transposed in tmp[k][i] */
+    for( i = 0; i < 4; i++ )
+    {
+        tmp[0][i] = luma[i][0] + luma[i][1] + luma[i][2] + luma[i][3];
+        tmp[1][i] = luma[i][0] + luma[i][1] - luma[i][2] - luma[i][3];
+        tmp[2][i] = luma[i][0] - luma[i][1] - luma[i][2] + luma[i][3];
+        tmp[3][i] = luma[i][0] - luma[i][1] + luma[i][2] - luma[i][3];
+    }
+    /* pass 2: same combinations on tmp row i, then ( x + 1 ) / 2; note that C "/" truncates toward zero */
+    for( i = 0; i < 4; i++ )
+    {
+        dct[0][i] = ( tmp[i][0] + tmp[i][1] + tmp[i][2] + tmp[i][3] + 1) / 2;
+        dct[1][i] = ( tmp[i][0] + tmp[i][1] - tmp[i][2] - tmp[i][3] + 1) / 2;
+        dct[2][i] = ( tmp[i][0] - tmp[i][1] - tmp[i][2] + tmp[i][3] + 1) / 2;
+        dct[3][i] = ( tmp[i][0] - tmp[i][1] + tmp[i][2] - tmp[i][3] + 1) / 2;
+    }
+}
+/* 4x4 forward transform with integer weights 1 and 2 (outputs 1 and 3 use the factor 2); no scaling or rounding. */
+static void dct_4x4( int16_t dct[4][4], int16_t luma[4][4] )
+{
+    int16_t tmp[4][4];
+    int i;
+    /* pass 1: weighted combinations of luma row i, stored transposed in tmp[k][i] */
+    for( i = 0; i < 4; i++ )
+    {
+        tmp[0][i] =   luma[i][0] +   luma[i][1] +   luma[i][2] +   luma[i][3];
+        tmp[1][i] = 2*luma[i][0] +   luma[i][1] -   luma[i][2] - 2*luma[i][3];
+        tmp[2][i] =   luma[i][0] -   luma[i][1] -   luma[i][2] +   luma[i][3];
+        tmp[3][i] =   luma[i][0] - 2*luma[i][1] + 2*luma[i][2] -   luma[i][3];
+    }
+    /* pass 2: the same weights applied to tmp row i, written to dct[k][i] */
+    for( i = 0; i < 4; i++ )
+    {
+        dct[0][i] =   tmp[i][0] +   tmp[i][1] +   tmp[i][2] +   tmp[i][3];
+        dct[1][i] = 2*tmp[i][0] +   tmp[i][1] -   tmp[i][2] - 2*tmp[i][3];
+        dct[2][i] =   tmp[i][0] -   tmp[i][1] -   tmp[i][2] +   tmp[i][3];
+        dct[3][i] =   tmp[i][0] - 2*tmp[i][1] + 2*tmp[i][2] -   tmp[i][3];
+    }
+}
+/* 4x4 inverse transform: two passes with half-weight (>>1) terms, final rounding ( x + 32 ) >> 6. */
+static void idct_4x4( int16_t dct[4][4], int16_t luma[4][4] )
+{
+    int16_t tmp[4][4];
+    int i;
+    /* pass 1: combine the four entries of dct column i into tmp[k][i] */
+    for( i = 0; i < 4; i++ )
+    {
+        tmp[0][i] = dct[0][i]   +  dct[1][i]     + dct[2][i]   + (dct[3][i]>>1);
+        tmp[1][i] = dct[0][i]   + (dct[1][i]>>1) - dct[2][i]   -  dct[3][i];
+        tmp[2][i] = dct[0][i]   - (dct[1][i]>>1) - dct[2][i]   +  dct[3][i];
+        tmp[3][i] = dct[0][i]   -  dct[1][i]     + dct[2][i]   - (dct[3][i]>>1);
+    }
+    /* pass 2: same combinations on tmp row i, rounded and scaled into luma row i */
+    for( i = 0; i < 4; i++ )
+    {
+        /* XXX keep ">> 6": "/ 64" does not give the same result (presumably they round negative values differently) */
+        luma[i][0] = ( tmp[i][0] +  tmp[i][1]     + tmp[i][2] + (tmp[i][3]>>1) + 32 ) >> 6;
+        luma[i][1] = ( tmp[i][0] + (tmp[i][1]>>1) - tmp[i][2] -  tmp[i][3]     + 32 ) >> 6;
+        luma[i][2] = ( tmp[i][0] - (tmp[i][1]>>1) - tmp[i][2] +  tmp[i][3]     + 32 ) >> 6;
+        luma[i][3] = ( tmp[i][0] -  tmp[i][1]     + tmp[i][2] - (tmp[i][3]>>1) + 32 ) >> 6;
+    }
+}
+static void idct_4x4_dc( int16_t dct[4][4], int16_t luma[4][4] ) /* same +1/-1 combinations as dct_4x4_dc, but with no rounding or scaling */
+{
+    int16_t tmp[4][4];
+    int i;
+    /* pass 1: combine the four entries of dct column i into tmp[k][i] */
+    for( i = 0; i < 4; i++ )
+    {
+        tmp[0][i] = dct[0][i] + dct[1][i] + dct[2][i] + dct[3][i];
+        tmp[1][i] = dct[0][i] + dct[1][i] - dct[2][i] - dct[3][i];
+        tmp[2][i] = dct[0][i] - dct[1][i] - dct[2][i] + dct[3][i];
+        tmp[3][i] = dct[0][i] - dct[1][i] + dct[2][i] - dct[3][i];
+    }
+    /* pass 2: same combinations on tmp row i, written unscaled to luma row i */
+    for( i = 0; i < 4; i++ )
+    {
+        luma[i][0] = ( tmp[i][0] + tmp[i][1] + tmp[i][2] + tmp[i][3] ) ;
+        luma[i][1] = ( tmp[i][0] + tmp[i][1] - tmp[i][2] - tmp[i][3] ) ;
+        luma[i][2] = ( tmp[i][0] - tmp[i][1] - tmp[i][2] + tmp[i][3] ) ;
+        luma[i][3] = ( tmp[i][0] - tmp[i][1] + tmp[i][2] - tmp[i][3] ) ;
+    }
+}
diff --git a/encoder/analyse.c b/encoder/analyse.c
new file mode 100644
index 00000000..77e2c316
--- /dev/null
+++ b/encoder/analyse.c
@@ -0,0 +1,1224 @@
+/*****************************************************************************
+ * analyse.c: h264 encoder library
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: analyse.c,v 1.1 2004/06/03 19:27:08 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdint.h>
+#include <math.h>
+
+#include "../core/common.h"
+#include "../core/macroblock.h"
+#include "macroblock.h"
+#include "me.h"
+
+typedef struct  /* motion-search results for one reference list (used as l0 / l1 in x264_mb_analysis_t) */
+{
+    /* 16x16 */
+    int i_ref;              /* reference index kept by the 16x16 search */
+    x264_me_t me16x16;      /* best 16x16 search; its .cost field is used as the 16x16 cost */
+
+    /* 8x8 */
+    int       i_cost8x8;    /* sum of the four me8x8[].cost values */
+    x264_me_t me8x8[4];     /* one search per 8x8 block */
+
+    /* Sub 4x4 */
+    int       i_cost4x4[4]; /* cost per 8x8 partition */
+    x264_me_t me4x4[4][4];  /* [8x8 block][4x4 sub-block] */
+
+    /* Sub 8x4 */
+    int       i_cost8x4[4]; /* cost per 8x8 partition */
+    x264_me_t me8x4[4][2];  /* [8x8 block][upper / lower half] */
+
+    /* Sub 4x8 */
+    int       i_cost4x8[4]; /* cost per 8x8 partition */
+    x264_me_t me4x8[4][4];  /* [8x8 block][left / right half]; NOTE(review): only [..][0..1] are written by x264_mb_analyse_inter_p4x8 -- [4][2] may suffice, confirm */
+
+    /* 16x8 */
+    int       i_cost16x8;   /* sum of both me16x8[].cost values */
+    x264_me_t me16x8[2];    /* [top / bottom half] */
+
+    /* 8x16 */
+    int       i_cost8x16;   /* sum of both me8x16[].cost values */
+    x264_me_t me8x16[2];    /* [left / right half] */
+
+} x264_mb_analysis_list_t;
+/* Per-macroblock analysis state: intra costs and chosen modes, plus inter results for list 0 and list 1. */
+typedef struct
+{
+    /* conduct the analysis using this lambda and QP */
+    int i_lambda;           /* i_qp0_cost_table[i_qp], multiplies every bit-cost term */
+    int i_qp;
+
+
+    /* I: Intra part */
+    /* Luma part 16x16 and 4x4 modes stats */
+    int i_sad_i16x16;       /* best 16x16 cost (satd + lambda * mode cost); -1 = not computed */
+    int i_predict16x16;     /* mode that produced i_sad_i16x16 */
+
+    int i_sad_i4x4;         /* total cost of the 16 4x4 blocks; -1 = not computed */
+    int i_predict4x4[4][4]; /* chosen 4x4 mode, indexed [x][y] in 4x4-block units */
+
+    /* Chroma part */
+    int i_sad_i8x8;         /* best chroma cost over both chroma planes; -1 = not computed */
+    int i_predict8x8;       /* mode that produced i_sad_i8x8 */
+
+    /* II: Inter part P/B frame */
+    int i_mv_range;         /* copied into every x264_me_t before x264_me_search */
+
+    x264_mb_analysis_list_t l0;
+    x264_mb_analysis_list_t l1;
+
+    int i_cost16x16bi; /* uses the same ref and mv as l0 and l1 (at least for now) */
+
+} x264_mb_analysis_t;
+/* Lambda for each QP 0..51; read by x264_mb_analyse_init as i_qp0_cost_table[i_qp]. */
+static const int i_qp0_cost_table[52] = {
+   1, 1, 1, 1, 1, 1, 1, 1,  /*  0-7 */
+   1, 1, 1, 1,              /*  8-11 */
+   1, 1, 1, 1, 2, 2, 2, 2,  /* 12-19 */
+   3, 3, 3, 4, 4, 4, 5, 6,  /* 20-27 */
+   6, 7, 8, 9,10,11,13,14,  /* 28-35 */
+  16,18,20,23,25,29,32,36,  /* 36-43 */
+  40,45,51,57,64,72,81,91   /* 44-51 */
+};
+/* Column (in 4x4-block units) of 4x4 block idx inside the macroblock; callers multiply it by 4 to get pixels. */
+static const uint8_t block_idx_x[16] = {
+    0, 1, 0, 1, 2, 3, 2, 3, 0, 1, 0, 1, 2, 3, 2, 3
+};
+static const uint8_t block_idx_y[16] = {    /* row (in 4x4-block units) of 4x4 block idx inside the macroblock */
+    0, 0, 1, 1, 0, 0, 1, 1, 2, 2, 3, 3, 2, 2, 3, 3
+};
+/* Reset *a for the current macroblock: zero it, store QP and lambda, and mark every cost that applies to the slice type as "not computed" (-1). */
+static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int i_qp )
+{
+    memset( a, 0, sizeof( x264_mb_analysis_t ) );
+
+    /* conduct the analysis using this lambda and QP */
+    a->i_qp = i_qp;
+    a->i_lambda = i_qp0_cost_table[i_qp];   /* i_qp must be within 0..51 (table size is 52) */
+
+    /* I: Intra part */
+    a->i_sad_i16x16 = -1;
+    a->i_sad_i4x4   = -1;
+    a->i_sad_i8x8   = -1;
+
+    /* II: Inter part P/B frame */
+    if( h->sh.i_type != SLICE_TYPE_I )
+    {
+        int dmb;
+        int i;
+
+        /* Calculate max start MV range: dmb = smallest of mb_x, mb_y, width - mb_x, height - mb_y */
+        dmb = h->mb.i_mb_x;
+        if( h->mb.i_mb_y < dmb )
+            dmb = h->mb.i_mb_y;
+        if( h->sps->i_mb_width - h->mb.i_mb_x < dmb )   /* NOTE(review): 1 for the last column, while the first column gives 0 -- confirm the asymmetry is intended */
+            dmb = h->sps->i_mb_width - h->mb.i_mb_x;
+        if( h->sps->i_mb_height - h->mb.i_mb_y < dmb )
+            dmb = h->sps->i_mb_height - h->mb.i_mb_y;
+
+        a->i_mv_range = 16*dmb + 8;
+
+        a->l0.me16x16.cost = -1;
+        a->l0.i_cost8x8    = -1;
+
+        for( i = 0; i < 4; i++ )
+        {
+            a->l0.i_cost4x4[i] = -1;
+            a->l0.i_cost8x4[i] = -1;
+            a->l0.i_cost4x8[i] = -1;
+        }
+
+        a->l0.i_cost16x8   = -1;
+        a->l0.i_cost8x16   = -1;
+        if( h->sh.i_type == SLICE_TYPE_B )  /* list 1 and the bi-predicted cost are only reset for B slices */
+        {
+            a->l1.me16x16.cost = -1;
+            a->l1.i_cost8x8    = -1;
+
+            for( i = 0; i < 4; i++ )
+            {
+                a->l1.i_cost4x4[i] = -1;
+                a->l1.i_cost8x4[i] = -1;
+                a->l1.i_cost4x8[i] = -1;
+            }
+
+            a->l1.i_cost16x8   = -1;
+            a->l1.i_cost8x16   = -1;
+
+            a->i_cost16x16bi   = -1;
+        }
+    }
+}
+
+
+
+/*
+ * Handle intra mb
+ */
+/* Fill mode[] with the 16x16 intra modes usable for the neighbours flagged in i_neighbour; *pi_count gets the count (max 4). */
+static void predict_16x16_mode_available( unsigned int i_neighbour, int *mode, int *pi_count )
+{
+    if( ( i_neighbour & (MB_LEFT|MB_TOP) ) == (MB_LEFT|MB_TOP) )
+    {
+        /* top and left available */
+        *mode++ = I_PRED_16x16_V;
+        *mode++ = I_PRED_16x16_H;
+        *mode++ = I_PRED_16x16_DC;
+        *mode++ = I_PRED_16x16_P;
+        *pi_count = 4;
+    }
+    else if( ( i_neighbour & MB_LEFT ) )
+    {
+        /* only left available */
+        *mode++ = I_PRED_16x16_DC_LEFT;
+        *mode++ = I_PRED_16x16_H;
+        *pi_count = 2;
+    }
+    else if( ( i_neighbour & MB_TOP ) )
+    {
+        /* only top available */
+        *mode++ = I_PRED_16x16_DC_TOP;
+        *mode++ = I_PRED_16x16_V;
+        *pi_count = 2;
+    }
+    else
+    {
+        /* none available */
+        *mode = I_PRED_16x16_DC_128;
+        *pi_count = 1;
+    }
+}
+
+/* Fill mode[] with the chroma intra modes usable for the neighbours flagged in i_neighbour; *pi_count gets the count (max 4). */
+static void predict_8x8_mode_available( unsigned int i_neighbour, int *mode, int *pi_count )
+{
+    if( ( i_neighbour & (MB_LEFT|MB_TOP) ) == (MB_LEFT|MB_TOP) )
+    {
+        /* top and left available */
+        *mode++ = I_PRED_CHROMA_V;
+        *mode++ = I_PRED_CHROMA_H;
+        *mode++ = I_PRED_CHROMA_DC;
+        *mode++ = I_PRED_CHROMA_P;
+        *pi_count = 4;
+    }
+    else if( ( i_neighbour & MB_LEFT ) )
+    {
+        /* only left available */
+        *mode++ = I_PRED_CHROMA_DC_LEFT;
+        *mode++ = I_PRED_CHROMA_H;
+        *pi_count = 2;
+    }
+    else if( ( i_neighbour & MB_TOP ) )
+    {
+        /* only top available */
+        *mode++ = I_PRED_CHROMA_DC_TOP;
+        *mode++ = I_PRED_CHROMA_V;
+        *pi_count = 2;
+    }
+    else
+    {
+        /* none available */
+        *mode = I_PRED_CHROMA_DC_128;
+        *pi_count = 1;
+    }
+}
+
+/* Fill mode[] with the 4x4 intra modes usable for block idx; *pi_count gets the count. MAX = 9 (7 + 2), so mode[] needs 9 slots. */
+static void predict_4x4_mode_available( unsigned int i_neighbour, int idx, int *mode, int *pi_count )
+{
+    int b_a, b_b, b_c;  /* left / top / (top-right | private) requirement of block idx is satisfied */
+    static const unsigned int needmb[16] =  /* neighbour flags block idx depends on; MB_PRIVATE presumably marks a top-right that is never available -- confirm */
+    {
+        MB_LEFT|MB_TOP, MB_TOP,
+        MB_LEFT,        MB_PRIVATE,
+        MB_TOP,         MB_TOP|MB_TOPRIGHT,
+        0,              MB_PRIVATE,
+        MB_LEFT,        0,
+        MB_LEFT,        MB_PRIVATE,
+        0,              MB_PRIVATE,
+        0,              MB_PRIVATE
+    };
+
+    /* FIXME even when b_c == 0 there are some cases where missing pixels
+     * are emulated and thus more modes are available. TODO:
+     * analysis and encode should be fixed too */
+    b_a = (needmb[idx]&i_neighbour&MB_LEFT) == (needmb[idx]&MB_LEFT);
+    b_b = (needmb[idx]&i_neighbour&MB_TOP) == (needmb[idx]&MB_TOP);
+    b_c = (needmb[idx]&i_neighbour&(MB_TOPRIGHT|MB_PRIVATE)) == (needmb[idx]&(MB_TOPRIGHT|MB_PRIVATE));
+
+    if( b_a && b_b )
+    {
+        *mode++ = I_PRED_4x4_DC;
+        *mode++ = I_PRED_4x4_H;
+        *mode++ = I_PRED_4x4_V;
+        *mode++ = I_PRED_4x4_DDR;
+        *mode++ = I_PRED_4x4_VR;
+        *mode++ = I_PRED_4x4_HD;
+        *mode++ = I_PRED_4x4_HU;
+
+        *pi_count = 7;
+
+        if( b_c )   /* DDL and VL are only offered when b_c holds as well */
+        {
+            *mode++ = I_PRED_4x4_DDL;
+            *mode++ = I_PRED_4x4_VL;
+            (*pi_count) += 2;
+        }
+    }
+    else if( b_a && !b_b )
+    {
+        *mode++ = I_PRED_4x4_DC_LEFT;
+        *mode++ = I_PRED_4x4_H;
+        *pi_count = 2;
+    }
+    else if( !b_a && b_b )
+    {
+        *mode++ = I_PRED_4x4_DC_TOP;
+        *mode++ = I_PRED_4x4_V;
+        *pi_count = 2;
+    }
+    else
+    {
+        *mode++ = I_PRED_4x4_DC_128;
+        *pi_count = 1;
+    }
+}
+/* Pick the best luma intra 16x16 mode and, when X264_ANALYSE_I4x4 is enabled, the best mode of each 4x4 block; costs and modes go into *res. */
+static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *res )
+{
+    const unsigned int flags = h->sh.i_type == SLICE_TYPE_I ? h->param.analyse.intra : h->param.analyse.inter;
+    const int i_stride = h->mb.pic.i_stride[0];
+    uint8_t  *p_src = h->mb.pic.p_fenc[0];  /* source pixels being encoded */
+    uint8_t  *p_dst = h->mb.pic.p_fdec[0];  /* predictions are written here, overwriting the previous trial */
+
+    int i, idx;
+
+    int i_max;
+    int predict_mode[9];    /* 9 = largest count predict_4x4_mode_available can return (7 + 2) */
+
+    /*---------------- Try all modes and calculate their score ---------------*/
+
+    /* 16x16 prediction selection */
+    predict_16x16_mode_available( h->mb.i_neighbour, predict_mode, &i_max );
+    for( i = 0; i < i_max; i++ )
+    {
+        int i_sad;
+        int i_mode;
+
+        i_mode = predict_mode[i];
+
+        /* we do the prediction */
+        h->predict_16x16[i_mode]( p_dst, i_stride );
+
+        /* cost = pixf.satd between prediction and source + lambda * size of the ue-coded mode */
+        i_sad = h->pixf.satd[PIXEL_16x16]( p_dst, i_stride, p_src, i_stride ) +
+                res->i_lambda * bs_size_ue( x264_mb_pred_mode16x16_fix[i_mode] );
+        /* a lower i_sad is better; -1 means no mode has been stored yet */
+        if( res->i_sad_i16x16 == -1 || res->i_sad_i16x16 > i_sad )
+        {
+            res->i_predict16x16 = i_mode;
+            res->i_sad_i16x16     = i_sad;
+        }
+    }
+
+    /* 4x4 prediction selection */
+    if( flags & X264_ANALYSE_I4x4 )
+    {
+        res->i_sad_i4x4 = 0;
+        for( idx = 0; idx < 16; idx++ )
+        {
+            uint8_t *p_src_by;
+            uint8_t *p_dst_by;
+            int     i_best;
+            int x, y;
+            int i_pred_mode;
+
+            i_pred_mode= x264_mb_predict_intra4x4_mode( h, idx );
+            x = block_idx_x[idx];
+            y = block_idx_y[idx];
+
+            p_src_by = p_src + 4 * x + 4 * y * i_stride;
+            p_dst_by = p_dst + 4 * x + 4 * y * i_stride;
+
+            i_best = -1;
+            predict_4x4_mode_available( h->mb.i_neighbour, idx, predict_mode, &i_max );
+            for( i = 0; i < i_max; i++ )
+            {
+                int i_sad;
+                int i_mode;
+
+                i_mode = predict_mode[i];
+
+                /* we do the prediction */
+                h->predict_4x4[i_mode]( p_dst_by, i_stride );
+
+                /* cost = pixf.satd between prediction and source for this 4x4 block */
+                i_sad = h->pixf.satd[PIXEL_4x4]( p_dst_by, i_stride,
+                                                 p_src_by, i_stride );
+                /* mode cost: lambda * 1 when the mode equals the predicted mode, otherwise lambda * 4 */
+                i_sad += res->i_lambda * (i_pred_mode == x264_mb_pred_mode4x4_fix[i_mode] ? 1 : 4);
+
+                /* a lower i_sad is better; -1 means no mode has been stored yet */
+                if( i_best == -1 || i_best > i_sad )
+                {
+                    res->i_predict4x4[x][y] = i_mode;   /* note the [x][y] index order */
+                    i_best = i_sad;
+                }
+            }
+            res->i_sad_i4x4 += i_best;
+
+            /* we need to encode this block now (the next ones are predicted from it) */
+            h->predict_4x4[res->i_predict4x4[x][y]]( p_dst_by, i_stride );
+            x264_mb_encode_i4x4( h, idx, res->i_qp );
+
+            /* we need to store the 'fixed' version */
+            h->mb.cache.intra4x4_pred_mode[x264_scan8[idx]] =
+                x264_mb_pred_mode4x4_fix[res->i_predict4x4[x][y]];
+        }
+        res->i_sad_i4x4 += res->i_lambda * 24;    /* from JVT (SATD0) */
+    }
+}
+/* Pick the chroma intra mode with the lowest combined cost over both chroma planes; result goes to res->i_predict8x8 / res->i_sad_i8x8. */
+static void x264_mb_analyse_intra_chroma( x264_t *h, x264_mb_analysis_t *res )
+{
+    int i;
+
+    int i_max;
+    int predict_mode[9];    /* predict_8x8_mode_available writes at most 4 entries */
+
+    uint8_t *p_dstc[2], *p_srcc[2];
+    int      i_stride[2];
+
+    /* 8x8 prediction selection for chroma: index 0 / 1 below = picture planes 1 / 2 */
+    p_dstc[0] = h->mb.pic.p_fdec[1];
+    p_dstc[1] = h->mb.pic.p_fdec[2];
+    p_srcc[0] = h->mb.pic.p_fenc[1];
+    p_srcc[1] = h->mb.pic.p_fenc[2];
+
+    i_stride[0] = h->mb.pic.i_stride[1];
+    i_stride[1] = h->mb.pic.i_stride[2];
+
+    predict_8x8_mode_available( h->mb.i_neighbour, predict_mode, &i_max );
+    res->i_sad_i8x8 = -1;
+    for( i = 0; i < i_max; i++ )
+    {
+        int i_sad;
+        int i_mode;
+
+        i_mode = predict_mode[i];
+
+        /* we do the prediction (the same mode is applied to both planes) */
+        h->predict_8x8[i_mode]( p_dstc[0], i_stride[0] );
+        h->predict_8x8[i_mode]( p_dstc[1], i_stride[1] );
+
+        /* we calculate the cost: satd of both planes + lambda * size of the ue-coded mode */
+        i_sad = h->pixf.satd[PIXEL_8x8]( p_dstc[0], i_stride[0],
+                                         p_srcc[0], i_stride[0] ) +
+                h->pixf.satd[PIXEL_8x8]( p_dstc[1], i_stride[1],
+                                         p_srcc[1], i_stride[1] ) +
+                res->i_lambda * bs_size_ue( x264_mb_pred_mode8x8_fix[i_mode] );
+
+        /* a lower i_sad is better; -1 means no mode has been stored yet */
+        if( res->i_sad_i8x8 == -1 || res->i_sad_i8x8 > i_sad )
+        {
+            res->i_predict8x8 = i_mode;
+            res->i_sad_i8x8     = i_sad;
+        }
+    }
+}
+/* 16x16 motion search over every list-0 reference; keeps the cheapest result in a->l0.me16x16 / a->l0.i_ref and caches that ref for the whole macroblock. */
+static void x264_mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a )
+{
+    x264_me_t m;
+    int i_ref;
+
+    /* 16x16 search on all reference frames: fields shared by every search */
+    m.i_pixel = PIXEL_16x16;
+    m.lm      = a->i_lambda;
+    m.p_fenc  = h->mb.pic.p_fenc[0];
+    m.i_stride= h->mb.pic.i_stride[0];
+    m.i_mv_range = a->i_mv_range;
+    m.b_mvc   = 0;
+//    m.mvc[0]  = 0;
+//    m.mvc[1]  = 0;
+
+    /* ME for ref 0 */
+    m.p_fref = h->mb.pic.p_fref[0][0][0];
+    x264_mb_predict_mv_16x16( h, 0, 0, m.mvp );
+    x264_me_search( h, &m );
+    /* NOTE(review): ref 0 gets no reference-index cost, unlike i_ref >= 1 below -- confirm this bias is intended */
+    a->l0.i_ref = 0;
+    a->l0.me16x16 = m;
+
+    for( i_ref = 1; i_ref < h->i_ref0; i_ref++ )
+    {
+        /* search with ref */
+        m.p_fref = h->mb.pic.p_fref[0][i_ref][0];
+        x264_mb_predict_mv_16x16( h, 0, i_ref, m.mvp );
+        x264_me_search( h, &m );
+
+        /* add ref cost */
+        m.cost += m.lm * bs_size_te( h->sh.i_num_ref_idx_l0_active - 1, i_ref );
+
+        if( m.cost < a->l0.me16x16.cost )   /* strict "<": on a tie the earlier reference is kept */
+        {
+            a->l0.i_ref = i_ref;
+            a->l0.me16x16 = m;
+        }
+    }
+
+    /* Set global ref, needed for all other modes */
+    x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.i_ref );
+}
+/* Motion search for the four 8x8 blocks on reference a->l0.i_ref; fills a->l0.me8x8[] and a->l0.i_cost8x8. */
+static void x264_mb_analyse_inter_p8x8( x264_t *h, x264_mb_analysis_t *a )
+{
+    uint8_t  *p_fref = h->mb.pic.p_fref[0][a->l0.i_ref][0];
+    uint8_t  *p_fenc = h->mb.pic.p_fenc[0];
+
+    int i;
+
+    /* XXX Needed for x264_mb_predict_mv */
+    h->mb.i_partition = D_8x8;
+
+    for( i = 0; i < 4; i++ )
+    {
+        x264_me_t *m = &a->l0.me8x8[i];
+        const int x8 = i%2;     /* column of the 8x8 block */
+        const int y8 = i/2;     /* row of the 8x8 block */
+
+        m->i_pixel = PIXEL_8x8;
+        m->lm      = a->i_lambda;
+
+        m->p_fenc = &p_fenc[8*(y8*h->mb.pic.i_stride[0]+x8)];
+        m->p_fref = &p_fref[8*(y8*h->mb.pic.i_stride[0]+x8)];
+        m->i_stride= h->mb.pic.i_stride[0];
+        m->i_mv_range = a->i_mv_range;
+
+        if( i == 0 )
+        {
+            /* first block only: hand the 16x16 vector to the search through mvc[] */
+            m->b_mvc   = 1;
+            m->mvc[0] = a->l0.me16x16.mv[0];
+            m->mvc[1] = a->l0.me16x16.mv[1];
+        }
+        else
+        {
+            m->b_mvc   = 0;
+        }
+
+        x264_mb_predict_mv( h, 0, 4*i, 2, m->mvp );
+        x264_me_search( h, m );
+        /* cache the vector right away; the cache is presumably read when predicting later blocks -- confirm */
+        x264_macroblock_cache_mv( h, 2*x8, 2*y8, 2, 2, 0, m->mv[0], m->mv[1] );
+    }
+
+    a->l0.i_cost8x8 = a->l0.me8x8[0].cost + a->l0.me8x8[1].cost +
+                   a->l0.me8x8[2].cost + a->l0.me8x8[3].cost;
+}
+/* Motion search for the two 16x8 halves; reads a->l0.me8x8[].mv, so x264_mb_analyse_inter_p8x8 must have run first. */
+static void x264_mb_analyse_inter_p16x8( x264_t *h, x264_mb_analysis_t *a )
+{
+    uint8_t  *p_fref = h->mb.pic.p_fref[0][a->l0.i_ref][0];
+    uint8_t  *p_fenc = h->mb.pic.p_fenc[0];
+
+    int i;
+
+    /* XXX Needed for x264_mb_predict_mv */
+    h->mb.i_partition = D_16x8;
+
+    for( i = 0; i < 2; i++ )
+    {
+        x264_me_t *m = &a->l0.me16x8[i];
+
+        m->i_pixel = PIXEL_16x8;
+        m->lm      = a->i_lambda;
+
+        m->p_fenc = &p_fenc[8*i*h->mb.pic.i_stride[0]]; /* i = 1 starts 8 rows further down */
+        m->p_fref = &p_fref[8*i*h->mb.pic.i_stride[0]];
+        m->i_stride= h->mb.pic.i_stride[0];
+        m->i_mv_range = a->i_mv_range;
+
+        m->b_mvc   = 1;
+        m->mvc[0] = a->l0.me8x8[2*i].mv[0];     /* vector of the left 8x8 block of this half (blocks 0 and 2) */
+        m->mvc[1] = a->l0.me8x8[2*i].mv[1];
+
+        x264_mb_predict_mv( h, 0, 8*i, 4, m->mvp );
+        x264_me_search( h, m );
+
+        x264_macroblock_cache_mv( h, 0, 2*i, 4, 2, 0, m->mv[0], m->mv[1] );
+    }
+
+    a->l0.i_cost16x8 = a->l0.me16x8[0].cost + a->l0.me16x8[1].cost;
+}
+/* Motion search for the two 8x16 halves; reads a->l0.me8x8[].mv, so x264_mb_analyse_inter_p8x8 must have run first. */
+static void x264_mb_analyse_inter_p8x16( x264_t *h, x264_mb_analysis_t *a )
+{
+    uint8_t  *p_fref = h->mb.pic.p_fref[0][a->l0.i_ref][0];
+    uint8_t  *p_fenc = h->mb.pic.p_fenc[0];
+
+    int i;
+
+    /* XXX Needed for x264_mb_predict_mv */
+    h->mb.i_partition = D_8x16;
+
+    for( i = 0; i < 2; i++ )
+    {
+        x264_me_t *m = &a->l0.me8x16[i];
+
+        m->i_pixel = PIXEL_8x16;
+        m->lm      = a->i_lambda;
+
+        m->p_fenc  = &p_fenc[8*i];  /* i = 1 starts 8 pixels to the right */
+        m->p_fref  = &p_fref[8*i];
+        m->i_stride= h->mb.pic.i_stride[0];
+        m->i_mv_range = a->i_mv_range;
+
+        m->b_mvc   = 1;
+        m->mvc[0] = a->l0.me8x8[i].mv[0];       /* vector of the top 8x8 block of this half (blocks 0 and 1) */
+        m->mvc[1] = a->l0.me8x8[i].mv[1];
+
+        x264_mb_predict_mv( h, 0, 4*i, 2, m->mvp );
+        x264_me_search( h, m );
+
+        x264_macroblock_cache_mv( h, 2*i, 0, 2, 4, 0, m->mv[0], m->mv[1] );
+    }
+
+    a->l0.i_cost8x16 = a->l0.me8x16[0].cost + a->l0.me8x16[1].cost;
+}
+/* Motion search for the four 4x4 blocks of 8x8 partition i8x8; fills a->l0.me4x4[i8x8][] and a->l0.i_cost4x4[i8x8]. */
+static void x264_mb_analyse_inter_p4x4( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
+{
+    uint8_t  *p_fref = h->mb.pic.p_fref[0][a->l0.i_ref][0];
+    uint8_t  *p_fenc = h->mb.pic.p_fenc[0];
+
+    int i4x4;
+
+    /* XXX Needed for x264_mb_predict_mv */
+    h->mb.i_partition = D_8x8;
+
+    for( i4x4 = 0; i4x4 < 4; i4x4++ )
+    {
+        const int idx = 4*i8x8 + i4x4;      /* 4x4 block index within the macroblock */
+        const int x4 = block_idx_x[idx];
+        const int y4 = block_idx_y[idx];
+
+        x264_me_t *m = &a->l0.me4x4[i8x8][i4x4];
+
+        m->i_pixel = PIXEL_4x4;
+        m->lm      = a->i_lambda;
+
+        m->p_fenc  = &p_fenc[4*(y4*h->mb.pic.i_stride[0]+x4)];
+        m->p_fref  = &p_fref[4*(y4*h->mb.pic.i_stride[0]+x4)];
+        m->i_stride= h->mb.pic.i_stride[0];
+        m->i_mv_range = a->i_mv_range;
+
+        if( i4x4 == 0 )
+        {
+            /* first sub-block only: hand the enclosing 8x8 vector to the search through mvc[] */
+            m->b_mvc   = 1;
+            m->mvc[0] = a->l0.me8x8[i8x8].mv[0];
+            m->mvc[1] = a->l0.me8x8[i8x8].mv[1];
+        }
+        else
+        {
+            m->b_mvc   = 0;
+        }
+
+        x264_mb_predict_mv( h, 0, idx, 1, m->mvp );
+        x264_me_search( h, m );
+
+        x264_macroblock_cache_mv( h, x4, y4, 1, 1, 0, m->mv[0], m->mv[1] );
+    }
+
+    a->l0.i_cost4x4[i8x8] = a->l0.me4x4[i8x8][0].cost +
+                         a->l0.me4x4[i8x8][1].cost +
+                         a->l0.me4x4[i8x8][2].cost +
+                         a->l0.me4x4[i8x8][3].cost;
+}
+/* Motion search for the two 8x4 halves of 8x8 partition i8x8; reads a->l0.me4x4[i8x8][0].mv, so the 4x4 search must have run first. */
+static void x264_mb_analyse_inter_p8x4( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
+{
+    uint8_t  *p_fref = h->mb.pic.p_fref[0][a->l0.i_ref][0];
+    uint8_t  *p_fenc = h->mb.pic.p_fenc[0];
+
+    int i8x4;
+
+    /* XXX Needed for x264_mb_predict_mv */
+    h->mb.i_partition = D_8x8;
+
+    for( i8x4 = 0; i8x4 < 2; i8x4++ )
+    {
+        const int idx = 4*i8x8 + 2*i8x4;    /* left 4x4 block of the upper (0) or lower (1) half */
+        const int x4 = block_idx_x[idx];
+        const int y4 = block_idx_y[idx];
+
+        x264_me_t *m = &a->l0.me8x4[i8x8][i8x4];
+
+        m->i_pixel = PIXEL_8x4;
+        m->lm      = a->i_lambda;
+
+        m->p_fenc  = &p_fenc[4*(y4*h->mb.pic.i_stride[0]+x4)];
+        m->p_fref  = &p_fref[4*(y4*h->mb.pic.i_stride[0]+x4)];
+        m->i_stride= h->mb.pic.i_stride[0];
+        m->i_mv_range = a->i_mv_range;
+
+        if( i8x4 == 0 )
+        {
+            /* first half only: hand the vector of the first 4x4 block to the search through mvc[] */
+            m->b_mvc   = 1;
+            m->mvc[0] = a->l0.me4x4[i8x8][0].mv[0];
+            m->mvc[1] = a->l0.me4x4[i8x8][0].mv[1];
+        }
+        else
+        {
+            m->b_mvc   = 0;
+        }
+
+        x264_mb_predict_mv( h, 0, idx, 2, m->mvp );
+        x264_me_search( h, m );
+
+        x264_macroblock_cache_mv( h, x4, y4, 2, 1, 0, m->mv[0], m->mv[1] );
+    }
+
+    a->l0.i_cost8x4[i8x8] = a->l0.me8x4[i8x8][0].cost + a->l0.me8x4[i8x8][1].cost;
+}
+/* Motion search for the two 4x8 halves of 8x8 partition i8x8; reads a->l0.me4x4[i8x8][0].mv, so the 4x4 search must have run first. */
+static void x264_mb_analyse_inter_p4x8( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
+{
+    uint8_t  *p_fref = h->mb.pic.p_fref[0][a->l0.i_ref][0];
+    uint8_t  *p_fenc = h->mb.pic.p_fenc[0];
+
+    int i4x8;
+
+    /* XXX Needed for x264_mb_predict_mv */
+    h->mb.i_partition = D_8x8;
+
+    for( i4x8 = 0; i4x8 < 2; i4x8++ )
+    {
+        const int idx = 4*i8x8 + i4x8;      /* top 4x4 block of the left (0) or right (1) half */
+        const int x4 = block_idx_x[idx];
+        const int y4 = block_idx_y[idx];
+
+        x264_me_t *m = &a->l0.me4x8[i8x8][i4x8];    /* only entries [..][0] and [..][1] are used here */
+
+        m->i_pixel = PIXEL_4x8;
+        m->lm      = a->i_lambda;
+
+        m->p_fenc  = &p_fenc[4*(y4*h->mb.pic.i_stride[0]+x4)];
+        m->p_fref  = &p_fref[4*(y4*h->mb.pic.i_stride[0]+x4)];
+        m->i_stride= h->mb.pic.i_stride[0];
+        m->i_mv_range = a->i_mv_range;
+
+        if( i4x8 == 0 )
+        {
+            /* first half only: hand the vector of the first 4x4 block to the search through mvc[] */
+            m->b_mvc   = 1;
+            m->mvc[0] = a->l0.me4x4[i8x8][0].mv[0];
+            m->mvc[1] = a->l0.me4x4[i8x8][0].mv[1];
+        }
+        else
+        {
+            m->b_mvc   = 0;
+        }
+
+        x264_mb_predict_mv( h, 0, idx, 1, m->mvp );
+        x264_me_search( h, m );
+
+        x264_macroblock_cache_mv( h, x4, y4, 1, 2, 0, m->mv[0], m->mv[1] );
+    }
+
+    a->l0.i_cost4x8[i8x8] = a->l0.me4x8[i8x8][0].cost + a->l0.me4x8[i8x8][1].cost;
+}
+
+/* B-slice 16x16 analysis: best 16x16 search per list (l0, l1), then the cost of averaging both predictions (a->i_cost16x16bi). */
+static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
+{
+    uint8_t pix1[16*16], pix2[16*16];   /* motion-compensated 16x16 blocks from list 0 / list 1, stride 16 */
+
+    x264_me_t m;
+    int i_ref;
+
+    /* 16x16 search on all reference frames: fields shared by every search */
+    m.i_pixel = PIXEL_16x16;
+    m.lm      = a->i_lambda;
+    m.p_fenc  = h->mb.pic.p_fenc[0];
+    m.i_stride= h->mb.pic.i_stride[0];
+    m.b_mvc   = 0;
+    m.i_mv_range = a->i_mv_range;
+
+    /* ME for List 0 ref 0 (NOTE(review): no reference-index cost is added for ref 0 -- confirm intended) */
+    m.p_fref = h->mb.pic.p_fref[0][0][0];
+    x264_mb_predict_mv_16x16( h, 0, 0, m.mvp );
+    x264_me_search( h, &m );
+
+    a->l0.i_ref = 0;
+    a->l0.me16x16 = m;
+
+    for( i_ref = 1; i_ref < h->i_ref0; i_ref++ )
+    {
+        /* search with ref */
+        m.p_fref = h->mb.pic.p_fref[0][i_ref][0];
+        x264_mb_predict_mv_16x16( h, 0, i_ref, m.mvp );
+        x264_me_search( h, &m );
+
+        /* add ref cost */
+        m.cost += m.lm * bs_size_te( h->sh.i_num_ref_idx_l0_active - 1, i_ref );
+
+        if( m.cost < a->l0.me16x16.cost )
+        {
+            a->l0.i_ref = i_ref;
+            a->l0.me16x16 = m;
+        }
+    }
+
+    /* ME for list 1 ref 0 (same procedure as list 0, using h->i_ref1 and the l1 counters) */
+    m.p_fref = h->mb.pic.p_fref[1][0][0];
+    x264_mb_predict_mv_16x16( h, 1, 0, m.mvp );
+    x264_me_search( h, &m );
+
+    a->l1.i_ref = 0;
+    a->l1.me16x16 = m;
+
+    for( i_ref = 1; i_ref < h->i_ref1; i_ref++ )
+    {
+        /* search with ref */
+        m.p_fref = h->mb.pic.p_fref[1][i_ref][0];
+        x264_mb_predict_mv_16x16( h, 1, i_ref, m.mvp );
+        x264_me_search( h, &m );
+
+        /* add ref cost */
+        m.cost += m.lm * bs_size_te( h->sh.i_num_ref_idx_l1_active - 1, i_ref );
+
+        if( m.cost < a->l1.me16x16.cost )
+        {
+            a->l1.i_ref = i_ref;
+            a->l1.me16x16 = m;
+        }
+    }
+
+    /* Set global ref, needed for all other modes. FIXME some work for mixed block mode */
+    x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.i_ref );
+    x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, a->l1.i_ref );
+
+    /* get cost of BI mode: build both predictions, average them into pix1, then measure against the source */
+    h->mc[MC_LUMA]( h->mb.pic.p_fref[0][a->l0.i_ref][0], h->mb.pic.i_stride[0],
+                    pix1, 16,
+                    a->l0.me16x16.mv[0], a->l0.me16x16.mv[1],
+                    16, 16 );
+    h->mc[MC_LUMA]( h->mb.pic.p_fref[1][a->l1.i_ref][0], h->mb.pic.i_stride[0],
+                    pix2, 16,
+                    a->l1.me16x16.mv[0], a->l1.me16x16.mv[1],
+                    16, 16 );
+    h->pixf.avg[PIXEL_16x16]( pix1, 16, pix2, 16 );     /* result is read from pix1 below */
+    /* cost = satd( source, average ) + lambda * ( both reference indices + both vector differences mv - mvp ) */
+    a->i_cost16x16bi = h->pixf.satd[PIXEL_16x16]( h->mb.pic.p_fenc[0], h->mb.pic.i_stride[0], pix1, 16 ) +
+                       a->i_lambda * ( bs_size_te( h->sh.i_num_ref_idx_l0_active - 1, a->l0.i_ref ) +
+                                       bs_size_te( h->sh.i_num_ref_idx_l1_active - 1, a->l1.i_ref ) +
+                                       bs_size_se( a->l0.me16x16.mv[0] - a->l0.me16x16.mvp[0] ) +
+                                       bs_size_se( a->l0.me16x16.mv[1] - a->l0.me16x16.mvp[1] ) +
+                                       bs_size_se( a->l1.me16x16.mv[0] - a->l1.me16x16.mvp[0] ) +
+                                       bs_size_se( a->l1.me16x16.mv[1] - a->l1.me16x16.mvp[1] ) );
+}
+
+/*****************************************************************************
+ * x264_macroblock_analyse:
+ *****************************************************************************/
+void x264_macroblock_analyse( x264_t *h )
+{
+    x264_mb_analysis_t analysis;
+    int i;
+
+    /* qp TODO */
+    h->mb.qp[h->mb.i_mb_xy] = x264_clip3( h->pps->i_pic_init_qp + h->sh.i_qp_delta + 0, 0, 51 );
+
+    /* init analysis */
+    x264_mb_analyse_init( h, &analysis, h->mb.qp[h->mb.i_mb_xy] );
+
+    /*--------------------------- Do the analysis ---------------------------*/
+    if( h->sh.i_type == SLICE_TYPE_I )
+    {
+        x264_mb_analyse_intra( h, &analysis );
+
+        if( analysis.i_sad_i4x4 >= 0 &&  analysis.i_sad_i4x4 < analysis.i_sad_i16x16 )
+            h->mb.i_type = I_4x4;
+        else
+            h->mb.i_type = I_16x16;
+    }
+    else if( h->sh.i_type == SLICE_TYPE_P )
+    {
+        const unsigned int i_neighbour = h->mb.i_neighbour;
+
+        int b_skip = 0;
+        int i_cost;
+
+        /* Fast P_SKIP detection */
+        if( analysis.i_qp == h->mb.i_last_qp &&
+            ( ( (i_neighbour&MB_LEFT) && h->mb.type[h->mb.i_mb_xy - 1] == P_SKIP ) ||
+              ( (i_neighbour&MB_TOP) && h->mb.type[h->mb.i_mb_xy - h->mb.i_mb_stride] == P_SKIP ) ||
+              ( ((i_neighbour&(MB_TOP|MB_LEFT)) == (MB_TOP|MB_LEFT) ) && h->mb.type[h->mb.i_mb_xy - h->mb.i_mb_stride-1 ] == P_SKIP ) ||
+              ( (i_neighbour&MB_TOPRIGHT) && h->mb.type[h->mb.i_mb_xy - h->mb.i_mb_stride+1 ] == P_SKIP ) ) )
+        {
+            b_skip = x264_macroblock_probe_pskip( h );
+        }
+
+        if( b_skip )
+        {
+            h->mb.i_type = P_SKIP;
+            h->mb.i_partition = D_16x16;
+        }
+        else
+        {
+            const unsigned int flags = h->param.analyse.inter;
+            int i_type;
+            int i_partition;
+
+            x264_mb_analyse_inter_p16x16( h, &analysis );
+            if( flags & X264_ANALYSE_PSUB16x16 )
+                x264_mb_analyse_inter_p8x8( h, &analysis );
+
+            /* Select best inter mode */
+            i_type = P_L0;
+            i_partition = D_16x16;
+            i_cost = analysis.l0.me16x16.cost;
+
+            if( ( flags & X264_ANALYSE_PSUB16x16 ) &&
+                analysis.l0.i_cost8x8 < analysis.l0.me16x16.cost )
+            {
+                int i;
+
+                i_type = P_8x8;
+                i_partition = D_8x8;
+                h->mb.i_sub_partition[0] = D_L0_8x8;
+                h->mb.i_sub_partition[1] = D_L0_8x8;
+                h->mb.i_sub_partition[2] = D_L0_8x8;
+                h->mb.i_sub_partition[3] = D_L0_8x8;
+
+                i_cost = analysis.l0.i_cost8x8;
+
+                /* Do sub 8x8 */
+                if( flags & X264_ANALYSE_PSUB8x8 )
+                {
+                    for( i = 0; i < 4; i++ )
+                    {
+                        x264_mb_analyse_inter_p4x4( h, &analysis, i );
+                        if( analysis.l0.i_cost4x4[i] < analysis.l0.me8x8[i].cost )
+                        {
+                            int i_cost8x8;
+
+                            h->mb.i_sub_partition[i] = D_L0_4x4;
+                            i_cost8x8 = analysis.l0.i_cost4x4[i];
+
+                            x264_mb_analyse_inter_p8x4( h, &analysis, i );
+                            if( analysis.l0.i_cost8x4[i] < analysis.l0.i_cost4x4[i] )
+                            {
+                                h->mb.i_sub_partition[i] = D_L0_8x4;
+                                i_cost8x8 = analysis.l0.i_cost8x4[i];
+                            }
+
+                            x264_mb_analyse_inter_p4x8( h, &analysis, i );
+                            if( analysis.l0.i_cost4x8[i] < analysis.l0.i_cost4x4[i] )
+                            {
+                                h->mb.i_sub_partition[i] = D_L0_4x8;
+                                i_cost8x8 = analysis.l0.i_cost4x8[i];
+                            }
+
+                            i_cost += i_cost8x8 - analysis.l0.me8x8[i].cost;
+                        }
+                    }
+                }
+
+                /* Now do sub 16x8/8x16 */
+                x264_mb_analyse_inter_p16x8( h, &analysis );
+                if( analysis.l0.i_cost16x8 < i_cost )
+                {
+                    i_type = P_L0;
+                    i_partition = D_16x8;
+                    i_cost = analysis.l0.i_cost16x8;
+                }
+
+                x264_mb_analyse_inter_p8x16( h, &analysis );
+                if( analysis.l0.i_cost8x16 < i_cost )
+                {
+                    i_type = P_L0;
+                    i_partition = D_8x16;
+                    i_cost = analysis.l0.i_cost8x16;
+                }
+            }
+
+            h->mb.i_type = i_type;
+            h->mb.i_partition = i_partition;
+
+            /* refine qpel */
+            if( h->mb.i_partition == D_16x16 )
+            {
+                x264_me_refine_qpel( h, &analysis.l0.me16x16 );
+                i_cost = analysis.l0.me16x16.cost;
+            }
+            else if( h->mb.i_partition == D_16x8 )
+            {
+                x264_me_refine_qpel( h, &analysis.l0.me16x8[0] );
+                x264_me_refine_qpel( h, &analysis.l0.me16x8[1] );
+                i_cost = analysis.l0.me16x8[0].cost + analysis.l0.me16x8[1].cost;
+            }
+            else if( h->mb.i_partition == D_8x16 )
+            {
+                x264_me_refine_qpel( h, &analysis.l0.me8x16[0] );
+                x264_me_refine_qpel( h, &analysis.l0.me8x16[1] );
+                i_cost = analysis.l0.me8x16[0].cost + analysis.l0.me8x16[1].cost;
+            }
+            else if( h->mb.i_partition == D_8x8 )
+            {
+                int i8x8;
+                i_cost = 0;
+                for( i8x8 = 0; i8x8 < 4; i8x8++ )
+                {
+                    switch( h->mb.i_sub_partition[i8x8] )
+                    {
+                        case D_L0_8x8:
+                            x264_me_refine_qpel( h, &analysis.l0.me8x8[i8x8] );
+                            i_cost += analysis.l0.me8x8[i8x8].cost;
+                            break;
+                        case D_L0_8x4:
+                            x264_me_refine_qpel( h, &analysis.l0.me8x4[i8x8][0] );
+                            x264_me_refine_qpel( h, &analysis.l0.me8x4[i8x8][1] );
+                            i_cost += analysis.l0.me8x4[i8x8][0].cost +
+                                      analysis.l0.me8x4[i8x8][1].cost;
+                            break;
+                        case D_L0_4x8:
+                            x264_me_refine_qpel( h, &analysis.l0.me4x8[i8x8][0] );
+                            x264_me_refine_qpel( h, &analysis.l0.me4x8[i8x8][1] );
+                            i_cost += analysis.l0.me4x8[i8x8][0].cost +
+                                      analysis.l0.me4x8[i8x8][1].cost;
+                            break;
+
+                        case D_L0_4x4:
+                            x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][0] );
+                            x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][1] );
+                            x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][2] );
+                            x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][3] );
+                            i_cost += analysis.l0.me4x4[i8x8][0].cost +
+                                      analysis.l0.me4x4[i8x8][1].cost +
+                                      analysis.l0.me4x4[i8x8][2].cost +
+                                      analysis.l0.me4x4[i8x8][3].cost;
+                            break;
+                        default:
+                            fprintf( stderr, "internal error (!8x8 && !4x4)" );
+                            break;
+                    }
+                }
+            }
+
+            x264_mb_analyse_intra( h, &analysis );
+            if( analysis.i_sad_i16x16 >= 0 && analysis.i_sad_i16x16 < i_cost )
+            {
+                h->mb.i_type = I_16x16;
+                i_cost = analysis.i_sad_i16x16;
+            }
+
+            if( analysis.i_sad_i4x4 >=0 && analysis.i_sad_i4x4 < i_cost )
+            {
+                h->mb.i_type = I_4x4;
+                i_cost = analysis.i_sad_i4x4;
+            }
+        }
+    }
+    else if( h->sh.i_type == SLICE_TYPE_B )
+    {
+        int i_cost;
+
+        /* best inter mode */
+        x264_mb_analyse_inter_b16x16( h, &analysis );
+        h->mb.i_type = B_L0_L0;
+        h->mb.i_partition = D_16x16;
+        i_cost = analysis.l0.me16x16.cost;
+
+        if( analysis.l1.me16x16.cost < i_cost )
+        {
+            h->mb.i_type = B_L1_L1;
+            i_cost = analysis.l1.me16x16.cost;
+        }
+        if( analysis.i_cost16x16bi < i_cost )
+        {
+            h->mb.i_type = B_BI_BI;
+            i_cost = analysis.i_cost16x16bi;
+        }
+
+        /* best intra mode */
+        x264_mb_analyse_intra( h, &analysis );
+        if( analysis.i_sad_i16x16 >= 0 && analysis.i_sad_i16x16 < i_cost )
+        {
+            h->mb.i_type = I_16x16;
+            i_cost = analysis.i_sad_i16x16;
+        }
+        if( analysis.i_sad_i4x4 >=0 && analysis.i_sad_i4x4 < i_cost )
+        {
+            h->mb.i_type = I_4x4;
+            i_cost = analysis.i_sad_i4x4;
+        }
+    }
+#undef BEST_TYPE
+
+    /*-------------------- Update MB from the analysis ----------------------*/
+    h->mb.type[h->mb.i_mb_xy] = h->mb.i_type;
+    switch( h->mb.i_type )
+    {
+        case I_4x4:
+            for( i = 0; i < 16; i++ )
+            {
+                h->mb.cache.intra4x4_pred_mode[x264_scan8[i]] =
+                    analysis.i_predict4x4[block_idx_x[i]][block_idx_y[i]];
+            }
+
+            x264_mb_analyse_intra_chroma( h, &analysis );
+            h->mb.i_chroma_pred_mode = analysis.i_predict8x8;
+            break;
+        case I_16x16:
+            h->mb.i_intra16x16_pred_mode = analysis.i_predict16x16;
+
+            x264_mb_analyse_intra_chroma( h, &analysis );
+            h->mb.i_chroma_pred_mode = analysis.i_predict8x8;
+            break;
+
+        case P_L0:
+            x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, analysis.l0.i_ref );
+            switch( h->mb.i_partition )
+            {
+                case D_16x16:
+                    x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 0, analysis.l0.me16x16.mv[0], analysis.l0.me16x16.mv[1] );
+                    break;
+
+                case D_16x8:
+                    x264_macroblock_cache_mv ( h, 0, 0, 4, 2, 0, analysis.l0.me16x8[0].mv[0], analysis.l0.me16x8[0].mv[1] );
+                    x264_macroblock_cache_mv ( h, 0, 2, 4, 2, 0, analysis.l0.me16x8[1].mv[0], analysis.l0.me16x8[1].mv[1] );
+                    break;
+
+                case D_8x16:
+                    x264_macroblock_cache_mv ( h, 0, 0, 2, 4, 0, analysis.l0.me8x16[0].mv[0], analysis.l0.me8x16[0].mv[1] );
+                    x264_macroblock_cache_mv ( h, 2, 0, 2, 4, 0, analysis.l0.me8x16[1].mv[0], analysis.l0.me8x16[1].mv[1] );
+                    break;
+
+                default:
+                    fprintf( stderr, "internal error P_L0 and partition=%d\n", h->mb.i_partition );
+                    break;
+            }
+            break;
+
+        case P_8x8:
+            x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, analysis.l0.i_ref );
+            for( i = 0; i < 4; i++ )
+            {
+                const int x = 2*(i%2);
+                const int y = 2*(i/2);
+
+                switch( h->mb.i_sub_partition[i] )
+                {
+                    case D_L0_8x8:
+                        x264_macroblock_cache_mv( h, x, y, 2, 2, 0, analysis.l0.me8x8[i].mv[0], analysis.l0.me8x8[i].mv[1] );
+                        break;
+                    case D_L0_8x4:
+                        x264_macroblock_cache_mv( h, x, y+0, 2, 1, 0, analysis.l0.me8x4[i][0].mv[0], analysis.l0.me8x4[i][0].mv[1] );
+                        x264_macroblock_cache_mv( h, x, y+1, 2, 1, 0, analysis.l0.me8x4[i][1].mv[0], analysis.l0.me8x4[i][1].mv[1] );
+                        break;
+                    case D_L0_4x8:
+                        x264_macroblock_cache_mv( h, x+0, y, 1, 2, 0, analysis.l0.me4x8[i][0].mv[0], analysis.l0.me4x8[i][0].mv[1] );
+                        x264_macroblock_cache_mv( h, x+1, y, 1, 2, 0, analysis.l0.me4x8[i][1].mv[0], analysis.l0.me4x8[i][1].mv[1] );
+                        break;
+                    case D_L0_4x4:
+                        x264_macroblock_cache_mv( h, x+0, y+0, 1, 1, 0, analysis.l0.me4x4[i][0].mv[0], analysis.l0.me4x4[i][0].mv[1] );
+                        x264_macroblock_cache_mv( h, x+1, y+0, 1, 1, 0, analysis.l0.me4x4[i][1].mv[0], analysis.l0.me4x4[i][1].mv[1] );
+                        x264_macroblock_cache_mv( h, x+0, y+1, 1, 1, 0, analysis.l0.me4x4[i][2].mv[0], analysis.l0.me4x4[i][2].mv[1] );
+                        x264_macroblock_cache_mv( h, x+1, y+1, 1, 1, 0, analysis.l0.me4x4[i][3].mv[0], analysis.l0.me4x4[i][3].mv[1] );
+                        break;
+                    default:
+                        fprintf( stderr, "internal error\n" );
+                        break;
+                }
+            }
+            break;
+
+        case P_SKIP:
+        {
+            int mvp[2];
+            x264_mb_predict_mv_pskip( h, mvp );
+            /* */
+            h->mb.i_partition = D_16x16;
+            x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, 0 );
+            x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 0, mvp[0], mvp[1] );
+            break;
+        }
+
+        case B_L0_L0:
+            switch( h->mb.i_partition )
+            {
+                case D_16x16:
+                    x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, analysis.l0.i_ref );
+                    x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 0, analysis.l0.me16x16.mv[0], analysis.l0.me16x16.mv[1] );
+
+                    x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, -1 );
+                    x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 1,  0, 0 );
+                    x264_macroblock_cache_mvd( h, 0, 0, 4, 4, 1,  0, 0 );
+                    break;
+                default:
+                    fprintf( stderr, "internal error\n" );
+                    break;
+            }
+            break;
+        case B_L1_L1:
+            switch( h->mb.i_partition )
+            {
+                case D_16x16:
+                    x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, -1 );
+                    x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 0,  0, 0 );
+                    x264_macroblock_cache_mvd( h, 0, 0, 4, 4, 0,  0, 0 );
+
+                    x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, analysis.l1.i_ref );
+                    x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 1, analysis.l1.me16x16.mv[0], analysis.l1.me16x16.mv[1] );
+                    break;
+
+                default:
+                    fprintf( stderr, "internal error\n" );
+                    break;
+            }
+            break;
+        case B_BI_BI:
+            switch( h->mb.i_partition )
+            {
+                case D_16x16:
+                    x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, analysis.l0.i_ref );
+                    x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 0, analysis.l0.me16x16.mv[0], analysis.l0.me16x16.mv[1] );
+
+                    x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, analysis.l1.i_ref );
+                    x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 1, analysis.l1.me16x16.mv[0], analysis.l1.me16x16.mv[1] );
+                    break;
+
+                default:
+                    fprintf( stderr, "internal error\n" );
+                    break;
+            }
+            break;
+
+        default:
+            fprintf( stderr, "internal error (invalid MB type)\n" );
+            break;
+    }
+}
+
diff --git a/encoder/analyse.h b/encoder/analyse.h
new file mode 100644
index 00000000..8e591e89
--- /dev/null
+++ b/encoder/analyse.h
@@ -0,0 +1,29 @@
+/*****************************************************************************
+ * analyse.h: h264 encoder library
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: analyse.h,v 1.1 2004/06/03 19:27:08 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#ifndef _ANALYSE_H
+#define _ANALYSE_H 1
+/* NOTE(review): this header includes nothing, so x264_t must already be declared by the including file - presumably via the core common header; confirm. */
+void x264_macroblock_analyse( x264_t *h );
+
+#endif /* _ANALYSE_H */
diff --git a/encoder/cabac.c b/encoder/cabac.c
new file mode 100644
index 00000000..8b1f3965
--- /dev/null
+++ b/encoder/cabac.c
@@ -0,0 +1,1195 @@
+/*****************************************************************************
+ * cabac.c: h264 encoder library
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: cabac.c,v 1.1 2004/06/03 19:27:08 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdint.h>
+
+#include "../core/common.h"
+#include "macroblock.h"
+
+static const uint8_t block_idx_x[16] =     /* x (0..3) of block index i; indices 4k..4k+3 cover one 2x2 quadrant */
+{
+    0, 1, 0, 1, 2, 3, 2, 3, 0, 1, 0, 1, 2, 3, 2, 3
+};
+static const uint8_t block_idx_y[16] =     /* y (0..3) of block index i, same ordering as block_idx_x */
+{
+    0, 0, 1, 1, 0, 0, 1, 1, 2, 2, 3, 3, 2, 2, 3, 3
+};
+static const uint8_t block_idx_xy[4][4] =  /* inverse map: block_idx_xy[block_idx_x[i]][block_idx_y[i]] == i (first subscript is x) */
+{
+    { 0, 2, 8,  10},
+    { 1, 3, 9,  11},
+    { 4, 6, 12, 14},
+    { 5, 7, 13, 15}
+};
+
+static void x264_cabac_mb_type( x264_t *h )
+{
+    const int i_mb_type = h->mb.i_type;
+    /* Emit the CABAC decisions that code h->mb.i_type; context numbers and bit strings depend on the slice type (h->sh.i_type). */
+    if( h->sh.i_type == SLICE_TYPE_I )
+    {
+        int ctx = 0;    /* number of left/top neighbours (looked at when i_mb_x / i_mb_y > 0) whose type is not I_4x4 */
+        if( h->mb.i_mb_x > 0 && h->mb.type[h->mb.i_mb_xy - 1] != I_4x4 )
+        {
+            ctx++;
+        }
+        if( h->mb.i_mb_y > 0 && h->mb.type[h->mb.i_mb_xy - h->mb.i_mb_stride] != I_4x4 )
+        {
+            ctx++;
+        }
+
+        if( i_mb_type == I_4x4 )
+        {
+            x264_cabac_encode_decision( &h->cabac, 3 + ctx, 0 );
+        }
+        else if( i_mb_type == I_PCM )
+        {
+            x264_cabac_encode_decision( &h->cabac, 3 + ctx, 1 );
+            x264_cabac_encode_terminal( &h->cabac, 1 );
+        }
+        else    /* I_16x16 (every type not matched above ends up here) */
+        {
+            x264_cabac_encode_decision( &h->cabac, 3 + ctx, 1 );
+            x264_cabac_encode_terminal( &h->cabac, 0 );
+            /* then: "luma cbp is non-zero" flag, chroma cbp (one or two decisions), and the two bits of the 16x16 pred mode */
+            x264_cabac_encode_decision( &h->cabac, 3 + 3, ( h->mb.i_cbp_luma == 0 ? 0 : 1 ));
+            if( h->mb.i_cbp_chroma == 0 )
+            {
+                x264_cabac_encode_decision( &h->cabac, 3 + 4, 0 );
+            }
+            else
+            {
+                x264_cabac_encode_decision( &h->cabac, 3 + 4, 1 );
+                x264_cabac_encode_decision( &h->cabac, 3 + 5, ( h->mb.i_cbp_chroma == 1 ? 0 : 1 ) );
+            }
+            x264_cabac_encode_decision( &h->cabac, 3 + 6, ( (h->mb.i_intra16x16_pred_mode / 2) ? 1 : 0 ));
+            x264_cabac_encode_decision( &h->cabac, 3 + 7, ( (h->mb.i_intra16x16_pred_mode % 2) ? 1 : 0 ));
+        }
+    }
+    else if( h->sh.i_type == SLICE_TYPE_P )
+    {
+        /* prefix: 14, suffix: 17 */
+        if( i_mb_type == P_L0 )
+        {
+            if( h->mb.i_partition == D_16x16 )
+            {
+                x264_cabac_encode_decision( &h->cabac, 14, 0 );
+                x264_cabac_encode_decision( &h->cabac, 15, 0 );
+                x264_cabac_encode_decision( &h->cabac, 16, 0 );
+            }
+            else if( h->mb.i_partition == D_16x8 )
+            {
+                x264_cabac_encode_decision( &h->cabac, 14, 0 );
+                x264_cabac_encode_decision( &h->cabac, 15, 1 );
+                x264_cabac_encode_decision( &h->cabac, 17, 1 );
+            }
+            else if( h->mb.i_partition == D_8x16 )
+            {
+                x264_cabac_encode_decision( &h->cabac, 14, 0 );
+                x264_cabac_encode_decision( &h->cabac, 15, 1 );
+                x264_cabac_encode_decision( &h->cabac, 17, 0 );
+            }   /* any other partition value writes nothing at all */
+        }
+        else if( i_mb_type == P_8x8 )
+        {
+            x264_cabac_encode_decision( &h->cabac, 14, 0 );
+            x264_cabac_encode_decision( &h->cabac, 15, 0 );
+            x264_cabac_encode_decision( &h->cabac, 16, 1 );
+        }
+        else if( i_mb_type == I_4x4 )
+        {
+            /* prefix */
+            x264_cabac_encode_decision( &h->cabac, 14, 1 );
+
+            x264_cabac_encode_decision( &h->cabac, 17, 0 );
+        }
+        else if( i_mb_type == I_PCM )
+        {
+            /* prefix */
+            x264_cabac_encode_decision( &h->cabac, 14, 1 );
+
+            x264_cabac_encode_decision( &h->cabac, 17, 1 );
+            x264_cabac_encode_terminal( &h->cabac, 1 ); /*ctxIdx == 276 */
+        }
+        else /* intra 16x16 (every type not matched above ends up here) */
+        {
+            /* prefix */
+            x264_cabac_encode_decision( &h->cabac, 14, 1 );
+
+            /* suffix */
+            x264_cabac_encode_decision( &h->cabac, 17, 1 );
+            x264_cabac_encode_terminal( &h->cabac, 0 ); /*ctxIdx == 276 */
+            /* same fields as in the I slice branch; here contexts 17+2 and 17+3 are each used for two decisions */
+            x264_cabac_encode_decision( &h->cabac, 17+1, ( h->mb.i_cbp_luma == 0 ? 0 : 1 ));
+            if( h->mb.i_cbp_chroma == 0 )
+            {
+                x264_cabac_encode_decision( &h->cabac, 17+2, 0 );
+            }
+            else
+            {
+                x264_cabac_encode_decision( &h->cabac, 17+2, 1 );
+                x264_cabac_encode_decision( &h->cabac, 17+2, ( h->mb.i_cbp_chroma == 1 ? 0 : 1 ) );
+            }
+            x264_cabac_encode_decision( &h->cabac, 17+3, ( (h->mb.i_intra16x16_pred_mode / 2) ? 1 : 0 ));
+            x264_cabac_encode_decision( &h->cabac, 17+3, ( (h->mb.i_intra16x16_pred_mode % 2) ? 1 : 0 ));
+        }
+    }
+    else if( h->sh.i_type == SLICE_TYPE_B )
+    {
+        int ctx = 0;    /* number of left/top neighbours (looked at when i_mb_x / i_mb_y > 0) that are neither B_SKIP nor B_DIRECT */
+        if( h->mb.i_mb_x > 0 && h->mb.type[h->mb.i_mb_xy - 1] != B_SKIP && h->mb.type[h->mb.i_mb_xy - 1] != B_DIRECT )
+        {
+            ctx++;
+        }
+        if( h->mb.i_mb_y > 0 && h->mb.type[h->mb.i_mb_xy - h->mb.i_mb_stride] != B_SKIP && h->mb.type[h->mb.i_mb_xy - h->mb.i_mb_stride] != B_DIRECT )
+        {
+            ctx++;
+        }
+
+        if( i_mb_type == B_DIRECT )
+        {
+            x264_cabac_encode_decision( &h->cabac, 27+ctx, 0 );
+        }
+        else if( i_mb_type == B_8x8 )
+        {
+            x264_cabac_encode_decision( &h->cabac, 27+ctx, 1 );
+            x264_cabac_encode_decision( &h->cabac, 27+3,   1 );
+            x264_cabac_encode_decision( &h->cabac, 27+4,   1 );
+
+            x264_cabac_encode_decision( &h->cabac, 27+5,   1 );
+            x264_cabac_encode_decision( &h->cabac, 27+5,   1 );
+            x264_cabac_encode_decision( &h->cabac, 27+5,   1 );
+        }
+        else if( IS_INTRA( i_mb_type ) )
+        {
+            /* prefix */
+            x264_cabac_encode_decision( &h->cabac, 27+ctx, 1 );
+            x264_cabac_encode_decision( &h->cabac, 27+3,   1 );
+            x264_cabac_encode_decision( &h->cabac, 27+4,   1 );
+
+            x264_cabac_encode_decision( &h->cabac, 27+5,   1 );
+            x264_cabac_encode_decision( &h->cabac, 27+5,   0 );
+            x264_cabac_encode_decision( &h->cabac, 27+5,   1 );
+
+            /* Suffix */
+            if( i_mb_type == I_4x4 )
+            {
+                x264_cabac_encode_decision( &h->cabac, 32, 0 );
+            }
+            else if( i_mb_type == I_PCM )
+            {
+                x264_cabac_encode_decision( &h->cabac, 32, 1 );
+                x264_cabac_encode_terminal( &h->cabac,     1 );
+            }
+            else
+            {
+                x264_cabac_encode_decision( &h->cabac, 32, 1 );
+                x264_cabac_encode_terminal( &h->cabac,     0 );
+
+                /* TODO */
+                x264_cabac_encode_decision( &h->cabac, 32+1, ( h->mb.i_cbp_luma == 0 ? 0 : 1 ));
+                if( h->mb.i_cbp_chroma == 0 )
+                {
+                    x264_cabac_encode_decision( &h->cabac, 32+2, 0 );
+                }
+                else
+                {
+                    x264_cabac_encode_decision( &h->cabac, 32+2, 1 );
+                    x264_cabac_encode_decision( &h->cabac, 32+2, ( h->mb.i_cbp_chroma == 1 ? 0 : 1 ) );
+                }
+                x264_cabac_encode_decision( &h->cabac, 32+3, ( (h->mb.i_intra16x16_pred_mode / 2) ? 1 : 0 ));
+                x264_cabac_encode_decision( &h->cabac, 32+3, ( (h->mb.i_intra16x16_pred_mode % 2) ? 1 : 0 ));
+            }
+        }
+        else
+        {
+            static const int i_mb_len[21] =     /* how many entries of the matching i_mb_bits row are written */
+            {
+                3, 6, 6,    /* L0 L0 */
+                3, 6, 6,    /* L1 L1 */
+                6, 7, 7,    /* BI BI */
+
+                6, 6,       /* L0 L1 */
+                6, 6,       /* L1 L0 */
+                7, 7,       /* L0 BI */
+                7, 7,       /* L1 BI */
+                7, 7,       /* BI L0 */
+                7, 7,       /* BI L1 */
+            };
+            static const int i_mb_bits[21][7] = /* decision values, one row per (type, partition) in the order idx is built below */
+            {
+                { 1, 0, 0, },            { 1, 1, 0, 0, 0, 1, },    { 1, 1, 0, 0, 1, 0, },   /* L0 L0 */
+                { 1, 0, 1, },            { 1, 1, 0, 0, 1, 1, },    { 1, 1, 0, 1, 0, 0, },   /* L1 L1 */
+                { 1, 1, 0, 0, 0, 0 ,},   { 1, 1, 1, 1, 0, 0 , 0 }, { 1, 1, 1, 1, 0, 0 , 1 },/* BI BI */
+
+                { 1, 1, 0, 1, 0, 1, },   { 1, 1, 0, 1, 1, 0, },     /* L0 L1 */
+                { 1, 1, 0, 1, 1, 1, },   { 1, 1, 1, 1, 1, 0, },     /* L1 L0 */
+                { 1, 1, 1, 0, 0, 0, 0 }, { 1, 1, 1, 0, 0, 0, 1 },   /* L0 BI */
+                { 1, 1, 1, 0, 0, 1, 0 }, { 1, 1, 1, 0, 0, 1, 1 },   /* L1 BI */
+                { 1, 1, 1, 0, 1, 0, 0 }, { 1, 1, 1, 0, 1, 0, 1 },   /* BI L0 */
+                { 1, 1, 1, 0, 1, 1, 0 }, { 1, 1, 1, 0, 1, 1, 1 }    /* BI L1 */
+            };
+
+            const int i_partition = h->mb.i_partition;
+            int idx = 0;    /* row of i_mb_bits / i_mb_len, accumulated by the fall-through cases below */
+            int i;
+            switch( i_mb_type )
+            {
+                /* D_16x16, D_16x8, D_8x16 */
+                case B_BI_BI: idx += 3;     /* fall through */
+                case B_L1_L1: idx += 3;     /* fall through */
+                case B_L0_L0:
+                    if( i_partition == D_16x8 )
+                        idx += 1;
+                    else if( i_partition == D_8x16 )
+                        idx += 2;
+                    break;
+
+                /* D_16x8, D_8x16 */
+                case B_BI_L1: idx += 2;     /* fall through */
+                case B_BI_L0: idx += 2;     /* fall through */
+                case B_L1_BI: idx += 2;     /* fall through */
+                case B_L0_BI: idx += 2;     /* fall through */
+                case B_L1_L0: idx += 2;     /* fall through */
+                case B_L0_L1:
+                    idx += 3*3;             /* skip the nine rows of the first group */
+                    if( i_partition == D_8x16 )
+                        idx++;
+                    break;
+                default:
+                    fprintf( stderr, "error in B mb type\n" );
+                    return;                 /* no decision has been written for this macroblock type */
+            }
+            /* 1st decision on 27+ctx, 2nd on 27+3, 3rd on 27+4 or 27+5 depending on the 2nd value, all remaining ones on 27+5 */
+            x264_cabac_encode_decision( &h->cabac, 27+ctx,                         i_mb_bits[idx][0] );
+            x264_cabac_encode_decision( &h->cabac, 27+3,                           i_mb_bits[idx][1] );
+            x264_cabac_encode_decision( &h->cabac, 27+(i_mb_bits[idx][1] != 0 ? 4 : 5), i_mb_bits[idx][2] );
+            for( i = 3; i < i_mb_len[idx]; i++ )
+            {
+                x264_cabac_encode_decision( &h->cabac, 27+5,                       i_mb_bits[idx][i] );
+            }
+        }
+    }
+    else
+    {
+        fprintf( stderr, "unknown SLICE_TYPE unsupported in x264_macroblock_write_cabac\n" );
+    }
+}
+
+/* Code one intra 4x4 prediction mode against its predicted value i_pred:
+ * a decision on context 68 tells whether i_mode equals i_pred; if it does
+ * not, three decisions on context 69 carry the mode, lowest bit first, after
+ * lowering by one any mode that is greater than i_pred. */
+static void x264_cabac_mb_intra4x4_pred_mode( x264_t *h, int i_pred, int i_mode )
+{
+    int i_rem, i_bit;
+
+    if( i_mode == i_pred )
+    {
+        x264_cabac_encode_decision( &h->cabac, 68, 1 );
+        return;
+    }
+    x264_cabac_encode_decision( &h->cabac, 68, 0 );
+
+    /* i_pred itself cannot occur on this path, so the modes above it shift down */
+    i_rem = i_mode - ( i_mode > i_pred ? 1 : 0 );
+    for( i_bit = 0; i_bit < 3; i_bit++ )
+        x264_cabac_encode_decision( &h->cabac, 69, (i_rem >> i_bit)&0x01 );
+}
+/* Code h->mb.i_chroma_pred_mode.  The first decision (contexts 64..66) is 0
+ * for mode 0 and 1 otherwise; its context counts the left and top macroblocks
+ * (looked at only when i_mb_x / i_mb_y > 0) whose stored chroma_pred_mode is
+ * non-zero.  Non-zero modes go on with context 67: one decision that is 0 only
+ * for mode 1 and, for modes above 1, a last one that is 0 only for mode 2. */
+static void x264_cabac_mb_intra8x8_pred_mode( x264_t *h )
+{
+    const int i_pred_mode = h->mb.i_chroma_pred_mode;
+    /* Per the original author's note, I4x4 / I_16x16 need no test here because
+     * cache_save takes care of it (not verifiable from this file). */
+    const int b_left = h->mb.i_mb_x > 0 &&
+                       h->mb.chroma_pred_mode[h->mb.i_mb_xy - 1] != 0;
+    const int b_top  = h->mb.i_mb_y > 0 &&
+                       h->mb.chroma_pred_mode[h->mb.i_mb_xy - h->mb.i_mb_stride] != 0;
+    const int i_ctx_first = 64 + b_left + b_top;
+
+    if( i_pred_mode == 0 )
+    {
+        x264_cabac_encode_decision( &h->cabac, i_ctx_first, 0 );
+        return;
+    }
+
+    x264_cabac_encode_decision( &h->cabac, i_ctx_first, 1 );
+    x264_cabac_encode_decision( &h->cabac, 64 + 3, ( i_pred_mode == 1 ? 0 : 1 ) );
+    if( i_pred_mode > 1 )
+    {
+        x264_cabac_encode_decision( &h->cabac, 64 + 3, ( i_pred_mode == 2 ? 0 : 1 ) );
+    }
+}
+
+/* Code the four low bits of h->mb.i_cbp_luma, one decision per 8x8 block, on
+ * contexts 73..76.  The context offset gains 1 when the 8x8 block to the left
+ * has a zero bit in h->mb.cbp[] and 2 when the 8x8 block above has one.  A
+ * neighbour inside the current macroblock is looked up at h->mb.i_mb_xy, one
+ * outside it in the left / top macroblock; the latter is ignored when
+ * i_mb_x / i_mb_y is 0. */
+static void x264_cabac_mb_cbp_luma( x264_t *h )
+{
+    /* TODO: clean up and optimize */
+    int i_quad;
+
+    for( i_quad = 0; i_quad < 4; i_quad++ )
+    {
+        /* x / y of block index 4*i_quad, the first index of this 8x8 */
+        const int bx = block_idx_x[4*i_quad];
+        const int by = block_idx_y[4*i_quad];
+        /* cbp bit of the left / top 8x8 within the macroblock that holds it */
+        const int i_bit_a = block_idx_xy[(bx-1)&0x03][by]/4;
+        const int i_bit_b = block_idx_xy[bx][(by-1)&0x03]/4;
+        const int b_coded = (h->mb.i_cbp_luma >> i_quad)&0x01;
+        /* macroblock index of each neighbour, -1 when there is none */
+        int i_xy_a = -1;
+        int i_xy_b = -1;
+        int i_inc = 0;
+
+        /* left: same macroblock for the right-hand 8x8 blocks */
+        if( bx > 0 || h->mb.i_mb_x > 0 )
+            i_xy_a = h->mb.i_mb_xy - ( bx > 0 ? 0 : 1 );
+        /* top: same macroblock for the lower 8x8 blocks */
+        if( by > 0 || h->mb.i_mb_y > 0 )
+            i_xy_b = h->mb.i_mb_xy - ( by > 0 ? 0 : h->mb.i_mb_stride );
+
+        /* Original note: no need to test for PCM and SKIP. */
+        if( i_xy_a >= 0 && ( (h->mb.cbp[i_xy_a] >> i_bit_a)&0x01 ) == 0 )
+        {
+            i_inc += 1;
+        }
+        if( i_xy_b >= 0 && ( (h->mb.cbp[i_xy_b] >> i_bit_b)&0x01 ) == 0 )
+        {
+            i_inc += 2;
+        }
+
+        x264_cabac_encode_decision( &h->cabac, 73 + i_inc, b_coded );
+    }
+}
+
+/* Code h->mb.i_cbp_chroma in one or two decisions.  The first (contexts
+ * 77..80) tells whether it is non-zero; the second (contexts 81..84), sent
+ * only for non-zero values, tells whether it is above 1.  Each context adds
+ * 1 for the left and 2 for the top macroblock when that neighbour's chroma
+ * cbp is non-zero (first decision) or equal to 2 (second decision).
+ * Neighbours are looked at only when i_mb_x / i_mb_y > 0. */
+static void x264_cabac_mb_cbp_chroma( x264_t *h )
+{
+    /* chroma cbp (bits 4-5 of h->mb.cbp[]) of the left and top macroblocks;
+     * -1 means the neighbour was not looked at and fails both tests below */
+    const int i_left = h->mb.i_mb_x > 0
+                     ? (h->mb.cbp[h->mb.i_mb_xy - 1] >> 4)&0x3
+                     : -1;
+    const int i_top  = h->mb.i_mb_y > 0
+                     ? (h->mb.cbp[h->mb.i_mb_xy - h->mb.i_mb_stride] >> 4)&0x3
+                     : -1;
+
+    /* first decision: which neighbours have a non-zero chroma cbp */
+    const int i_ctx_any = 77 + ( i_left > 0 ? 1 : 0 ) + ( i_top > 0 ? 2 : 0 );
+    /* second decision: which neighbours have a chroma cbp equal to 2 */
+    const int i_ctx_two = 77 + 4 + ( i_left == 2 ? 1 : 0 ) + ( i_top == 2 ? 2 : 0 );
+
+    /* The original author notes that SKIP/PCM need no test here. */
+    if( h->mb.i_cbp_chroma == 0 )
+    {
+        x264_cabac_encode_decision( &h->cabac, i_ctx_any, 0 );
+        return;
+    }
+
+    /* non-zero: flag it, then say whether it is above 1 */
+    x264_cabac_encode_decision( &h->cabac, i_ctx_any, 1 );
+    x264_cabac_encode_decision( &h->cabac, i_ctx_two,
+                                h->mb.i_cbp_chroma > 1 ? 1 : 0 );
+}
+
+/* Code the difference between h->mb.qp[] of this macroblock and
+ * h->mb.i_last_qp as a unary string: deltas 0, 1, -1, 2, -2 ... map to
+ * 0, 1, 2, 3, 4 ... decisions of 1, followed by a closing 0.
+ * TODO check it with != qp per mb */
+static void x264_cabac_mb_qp_delta( x264_t *h )
+{
+    const int i_prev_xy = h->mb.i_mb_xy - 1;
+    const int i_delta = h->mb.qp[h->mb.i_mb_xy] - h->mb.i_last_qp;
+    int i_ones = i_delta > 0 ? (2*i_delta - 1) : (-2*i_delta);
+    int i_ctx = 0;
+
+    /* First decision: context 61 instead of 60 when the preceding index is
+     * valid, h->mb.i_last_dqp != 0 and that macroblock is I_16x16 or has one of
+     * its low six cbp bits set.  (Original note: no need to test for PCM / SKIP.) */
+    if( i_prev_xy >= 0 && h->mb.i_last_dqp != 0 &&
+        ( h->mb.type[i_prev_xy] == I_16x16 || (h->mb.cbp[i_prev_xy]&0x3f) ) )
+        i_ctx = 1;
+
+    /* second decision on context 62, all later ones on 63 */
+    for( ; i_ones > 0; i_ones-- )
+    {
+        x264_cabac_encode_decision( &h->cabac,  60 + i_ctx, 1 );
+        i_ctx = i_ctx < 2 ? 2 : 3;
+    }
+    x264_cabac_encode_decision( &h->cabac,  60 + i_ctx, 0 );
+}
+
+/* Code the skip flag of the current macroblock: 1 when b_skip is non-zero,
+ * 0 otherwise.  The context starts at 11 in P slices and at 24 in every
+ * other slice type, and is raised by one for each of the left (looked at
+ * when i_mb_x > 0) and top (looked at when i_mb_y > 0) macroblocks whose
+ * type is not IS_SKIP. */
+void x264_cabac_mb_skip( x264_t *h, int b_skip )
+{
+    /* any slice that is not SLICE_TYPE_P is handled as SLICE_TYPE_B */
+    int i_ctx = h->sh.i_type == SLICE_TYPE_P ? 11 : 24;
+
+    /* count the neighbours that were not skipped */
+    if( h->mb.i_mb_x > 0 && !IS_SKIP( h->mb.type[h->mb.i_mb_xy -1]) )
+        i_ctx++;
+    if( h->mb.i_mb_y > 0 && !IS_SKIP( h->mb.type[h->mb.i_mb_xy -h->mb.i_mb_stride]) )
+        i_ctx++;
+
+    x264_cabac_encode_decision( &h->cabac, i_ctx, b_skip ? 1 : 0 );
+}
+
+/* Code the shape of one P sub-macroblock partition on contexts 21, 22, 23:
+ * D_L0_8x8 -> "1", D_L0_8x4 -> "00", D_L0_4x8 -> "011", D_L0_4x4 -> "010".
+ * Any other value of i_sub writes nothing. */
+static inline void x264_cabac_mb_sub_p_partition( x264_t *h, int i_sub )
+{
+    switch( i_sub )
+    {
+        case D_L0_8x8:
+            x264_cabac_encode_decision( &h->cabac, 21, 1 );
+            break;
+        case D_L0_8x4:
+            x264_cabac_encode_decision( &h->cabac, 21, 0 );
+            x264_cabac_encode_decision( &h->cabac, 22, 0 );
+            break;
+        case D_L0_4x8:
+        case D_L0_4x4:
+            x264_cabac_encode_decision( &h->cabac, 21, 0 );
+            x264_cabac_encode_decision( &h->cabac, 22, 1 );
+            x264_cabac_encode_decision( &h->cabac, 23, i_sub == D_L0_4x8 ? 1 : 0 );
+            break;
+        default:
+            break;
+    }
+}
+
+/* Binarize the sub-macroblock type of one 8x8 partition of a B_8x8 MB.
+ *
+ * Contexts start at 36: bin 0 uses 36, bin 1 uses 37, bin 2 uses 38 when
+ * bin 1 was 1 and 39 when bin 1 was 0, every later bin uses 39.
+ * The two 3-bin codes (D_L0_8x8 / D_L1_8x8) therefore end on context 39,
+ * not 38.
+ *
+ * Any other i_sub value writes nothing. */
+static inline void x264_cabac_mb_sub_b_partition( x264_t *h, int i_sub )
+{
+    switch( i_sub )
+    {
+        case D_DIRECT_8x8:  /* "0" */
+            x264_cabac_encode_decision( &h->cabac, 36, 0 );
+            break;
+        case D_L0_8x8:      /* "100" */
+            x264_cabac_encode_decision( &h->cabac, 36, 1 );
+            x264_cabac_encode_decision( &h->cabac, 37, 0 );
+            x264_cabac_encode_decision( &h->cabac, 39, 0 );
+            break;
+        case D_L1_8x8:      /* "101" */
+            x264_cabac_encode_decision( &h->cabac, 36, 1 );
+            x264_cabac_encode_decision( &h->cabac, 37, 0 );
+            x264_cabac_encode_decision( &h->cabac, 39, 1 );
+            break;
+        case D_BI_8x8:      /* "11000" */
+            x264_cabac_encode_decision( &h->cabac, 36, 1 );
+            x264_cabac_encode_decision( &h->cabac, 37, 1 );
+            x264_cabac_encode_decision( &h->cabac, 38, 0 );
+            x264_cabac_encode_decision( &h->cabac, 39, 0 );
+            x264_cabac_encode_decision( &h->cabac, 39, 0 );
+            break;
+        case D_L0_8x4:      /* "11001" */
+            x264_cabac_encode_decision( &h->cabac, 36, 1 );
+            x264_cabac_encode_decision( &h->cabac, 37, 1 );
+            x264_cabac_encode_decision( &h->cabac, 38, 0 );
+            x264_cabac_encode_decision( &h->cabac, 39, 0 );
+            x264_cabac_encode_decision( &h->cabac, 39, 1 );
+            break;
+        case D_L0_4x8:      /* "11010" */
+            x264_cabac_encode_decision( &h->cabac, 36, 1 );
+            x264_cabac_encode_decision( &h->cabac, 37, 1 );
+            x264_cabac_encode_decision( &h->cabac, 38, 0 );
+            x264_cabac_encode_decision( &h->cabac, 39, 1 );
+            x264_cabac_encode_decision( &h->cabac, 39, 0 );
+            break;
+        case D_L1_8x4:      /* "11011" */
+            x264_cabac_encode_decision( &h->cabac, 36, 1 );
+            x264_cabac_encode_decision( &h->cabac, 37, 1 );
+            x264_cabac_encode_decision( &h->cabac, 38, 0 );
+            x264_cabac_encode_decision( &h->cabac, 39, 1 );
+            x264_cabac_encode_decision( &h->cabac, 39, 1 );
+            break;
+        case D_L1_4x8:      /* "111000" */
+            x264_cabac_encode_decision( &h->cabac, 36, 1 );
+            x264_cabac_encode_decision( &h->cabac, 37, 1 );
+            x264_cabac_encode_decision( &h->cabac, 38, 1 );
+            x264_cabac_encode_decision( &h->cabac, 39, 0 );
+            x264_cabac_encode_decision( &h->cabac, 39, 0 );
+            x264_cabac_encode_decision( &h->cabac, 39, 0 );
+            break;
+        case D_BI_8x4:      /* "111001" */
+            x264_cabac_encode_decision( &h->cabac, 36, 1 );
+            x264_cabac_encode_decision( &h->cabac, 37, 1 );
+            x264_cabac_encode_decision( &h->cabac, 38, 1 );
+            x264_cabac_encode_decision( &h->cabac, 39, 0 );
+            x264_cabac_encode_decision( &h->cabac, 39, 0 );
+            x264_cabac_encode_decision( &h->cabac, 39, 1 );
+            break;
+        case D_BI_4x8:      /* "111010" */
+            x264_cabac_encode_decision( &h->cabac, 36, 1 );
+            x264_cabac_encode_decision( &h->cabac, 37, 1 );
+            x264_cabac_encode_decision( &h->cabac, 38, 1 );
+            x264_cabac_encode_decision( &h->cabac, 39, 0 );
+            x264_cabac_encode_decision( &h->cabac, 39, 1 );
+            x264_cabac_encode_decision( &h->cabac, 39, 0 );
+            break;
+        case D_L0_4x4:      /* "111011" */
+            x264_cabac_encode_decision( &h->cabac, 36, 1 );
+            x264_cabac_encode_decision( &h->cabac, 37, 1 );
+            x264_cabac_encode_decision( &h->cabac, 38, 1 );
+            x264_cabac_encode_decision( &h->cabac, 39, 0 );
+            x264_cabac_encode_decision( &h->cabac, 39, 1 );
+            x264_cabac_encode_decision( &h->cabac, 39, 1 );
+            break;
+        case D_L1_4x4:      /* "11110" */
+            x264_cabac_encode_decision( &h->cabac, 36, 1 );
+            x264_cabac_encode_decision( &h->cabac, 37, 1 );
+            x264_cabac_encode_decision( &h->cabac, 38, 1 );
+            x264_cabac_encode_decision( &h->cabac, 39, 1 );
+            x264_cabac_encode_decision( &h->cabac, 39, 0 );
+            break;
+        case D_BI_4x4:      /* "11111" */
+            x264_cabac_encode_decision( &h->cabac, 36, 1 );
+            x264_cabac_encode_decision( &h->cabac, 37, 1 );
+            x264_cabac_encode_decision( &h->cabac, 38, 1 );
+            x264_cabac_encode_decision( &h->cabac, 39, 1 );
+            x264_cabac_encode_decision( &h->cabac, 39, 1 );
+            break;
+        default:
+            break;
+    }
+}
+
+/* Write the reference index cached for block idx of list i_list as a unary
+ * code: one '1' bin per unit, closed by a '0' bin.
+ * The first bin context gets +1 when the cached ref of neighbour A (cache
+ * index - 1) is > 0 and +2 when that of neighbour B (cache index - 8) is. */
+static inline void x264_cabac_mb_ref( x264_t *h, int i_list, int idx )
+{
+    const int i_pos   = x264_scan8[idx];
+    const int i_value = h->mb.cache.ref[i_list][i_pos];
+    int i_ctx = 0;
+    int i_bin;
+
+    if( h->mb.cache.ref[i_list][i_pos - 1] > 0 )
+        i_ctx += 1;
+    if( h->mb.cache.ref[i_list][i_pos - 8] > 0 )
+        i_ctx += 2;
+
+    for( i_bin = 0; i_bin < i_value; i_bin++ )
+    {
+        x264_cabac_encode_decision( &h->cabac, 54 + i_ctx, 1 );
+        /* second bin: context 4, every later bin: context 5 */
+        i_ctx = i_ctx < 4 ? 4 : 5;
+    }
+    /* closing bin */
+    x264_cabac_encode_decision( &h->cabac, 54 + i_ctx, 0 );
+}
+
+
+
+/* Write one component of a motion vector difference.
+ *  l:   component, 0 uses context base 40, any other value 47
+ *  mvd: signed value. |mvd| is sent as a unary prefix capped at 9 bins,
+ *       followed once the cap is hit by an Exp-Golomb style bypass suffix
+ *       starting at order 3; a bypass sign bit ends non zero values. */
+static inline void  x264_cabac_mb_mvd_cpn( x264_t *h, int i_list, int idx, int l, int mvd )
+{
+    const int i_pos  = x264_scan8[idx];
+    /* |mvd| of the two neighbours (cache index - 1 and - 8), summed */
+    const int i_sum  = abs( h->mb.cache.mvd[i_list][i_pos - 1][l] ) +
+                       abs( h->mb.cache.mvd[i_list][i_pos - 8][l] );
+    const int i_mag  = abs( mvd );
+    const int i_ones = X264_MIN( i_mag, 9 );
+    const int i_base = l == 0 ? 40 : 47;
+    int i_inc;
+    int i_bin;
+
+    /* context of the first bin */
+    if( i_sum > 32 )
+        i_inc = 2;
+    else if( i_sum >= 3 )
+        i_inc = 1;
+    else
+        i_inc = 0;
+
+    for( i_bin = 0; i_bin < i_ones; i_bin++ )
+    {
+        x264_cabac_encode_decision( &h->cabac, i_base + i_inc, 1 );
+        if( i_inc < 3 )
+            i_inc = 3;
+        else if( i_inc < 6 )
+            i_inc++;
+    }
+    if( i_ones < 9 )
+        x264_cabac_encode_decision( &h->cabac, i_base + i_inc, 0 );
+    else
+    {
+        /* suffix */
+        int i_rest = i_mag - 9;
+        int i_order;
+
+        for( i_order = 3; i_rest >= (1 << i_order); i_order++ )
+        {
+            x264_cabac_encode_bypass( &h->cabac, 1 );
+            i_rest -= 1 << i_order;
+        }
+        x264_cabac_encode_bypass( &h->cabac, 0 );
+        while( i_order-- )
+            x264_cabac_encode_bypass( &h->cabac, (i_rest >> i_order)&0x01 );
+    }
+
+    /* sign */
+    if( mvd != 0 )
+        x264_cabac_encode_bypass( &h->cabac, mvd < 0 ? 1 : 0 );
+}
+
+/* Code the motion vector difference of the partition starting at block idx
+ * (width x height as handed to the mvd cache, 4 x 4 for a 16x16 partition)
+ * and store it in the cache, where later context selection reads it. */
+static inline void  x264_cabac_mb_mvd( x264_t *h, int i_list, int idx, int width, int height )
+{
+    const int i_pos = x264_scan8[idx];
+    int pred[2];
+    int diff[2];
+
+    x264_mb_predict_mv( h, i_list, idx, width, pred );
+    diff[0] = h->mb.cache.mv[i_list][i_pos][0] - pred[0];
+    diff[1] = h->mb.cache.mv[i_list][i_pos][1] - pred[1];
+
+    x264_cabac_mb_mvd_cpn( h, i_list, idx, 0, diff[0] );
+    x264_cabac_mb_mvd_cpn( h, i_list, idx, 1, diff[1] );
+    x264_macroblock_cache_mvd( h, block_idx_x[idx], block_idx_y[idx], width, height, i_list, diff[0], diff[1] );
+}
+
+/* Compute the context index of the coded block flag of one residual block.
+ *
+ *  i_cat: 0 luma DC (I_16x16), 1 luma AC (I_16x16), 2 luma 4x4,
+ *         3 chroma DC, 4 chroma AC
+ *  i_idx: block index inside that category
+ *
+ * For neighbour A (MB index - 1, cache index - 1) and neighbour B (MB index
+ * - i_mb_stride, cache index - 8) the function checks whether the matching
+ * block is flagged as holding non zero coefficients.
+ * Missing neighbour MBs count as coded when the current MB is intra.
+ * Returns 4 * i_cat + (A coded ? 1 : 0) + (B coded ? 2 : 0). */
+static int x264_cabac_mb_cbf_ctxidxinc( x264_t *h, int i_cat, int i_idx )
+{
+    int i_mba_xy = -1;  /* MB holding neighbour A, -1: none */
+    int i_mbb_xy = -1;  /* MB holding neighbour B, -1: none */
+    int i_nza = -1;     /* > 0: neighbour A block is coded */
+    int i_nzb = -1;     /* > 0: neighbour B block is coded */
+    int ctx = 0;
+
+    if( i_cat == 0 )
+    {
+        /* only an I_16x16 neighbour has a luma DC block (cbp bit 0x100) */
+        if( h->mb.i_mb_x > 0 )
+        {
+            i_mba_xy = h->mb.i_mb_xy -1;
+            if( h->mb.type[i_mba_xy] == I_16x16 )
+                i_nza = h->mb.cbp[i_mba_xy]&0x100;
+        }
+        if( h->mb.i_mb_y > 0 )
+        {
+            i_mbb_xy = h->mb.i_mb_xy - h->mb.i_mb_stride;
+            if( h->mb.type[i_mbb_xy] == I_16x16 )
+                i_nzb = h->mb.cbp[i_mbb_xy]&0x100;
+        }
+    }
+    else if( i_cat == 1 || i_cat == 2 )
+    {
+        int x = block_idx_x[i_idx];
+        int y = block_idx_y[i_idx];
+
+        /* the neighbour lies in the current MB, else in the left/top MB if any */
+        if( x > 0 )
+            i_mba_xy = h->mb.i_mb_xy;
+        else if( h->mb.i_mb_x > 0 )
+            i_mba_xy = h->mb.i_mb_xy -1;
+
+        if( y > 0 )
+            i_mbb_xy = h->mb.i_mb_xy;
+        else if( h->mb.i_mb_y > 0 )
+            i_mbb_xy = h->mb.i_mb_xy - h->mb.i_mb_stride;
+
+        /* no need to test for skip/pcm */
+        if( i_mba_xy >= 0 )
+        {
+            /* 8x8 block, i.e. luma cbp bit, containing the neighbouring 4x4 block */
+            const int i8x8a = block_idx_xy[(x-1)&0x03][y]/4;
+
+            /* test that single bit: without the &0x01 any higher luma cbp
+             * bit would also make the neighbour look coded */
+            if( ( (h->mb.cbp[i_mba_xy]&0x0f) >> i8x8a )&0x01 )
+                i_nza = h->mb.cache.non_zero_count[x264_scan8[i_idx] - 1];
+        }
+        if( i_mbb_xy >= 0 )
+        {
+            const int i8x8b = block_idx_xy[x][(y-1)&0x03]/4;
+
+            if( ( (h->mb.cbp[i_mbb_xy]&0x0f) >> i8x8b )&0x01 )
+                i_nzb = h->mb.cache.non_zero_count[x264_scan8[i_idx] - 8];
+        }
+    }
+    else if( i_cat == 3 )
+    {
+        /* no need to test skip/pcm */
+        /* chroma DC flags live in cbp bit 0x200 << i_idx, valid if chroma is coded */
+        if( h->mb.i_mb_x > 0 )
+        {
+            i_mba_xy = h->mb.i_mb_xy -1;
+            if( h->mb.cbp[i_mba_xy]&0x30 )
+                i_nza = h->mb.cbp[i_mba_xy]&( 0x02 << ( 8 + i_idx) );
+        }
+        if( h->mb.i_mb_y > 0 )
+        {
+            i_mbb_xy = h->mb.i_mb_xy - h->mb.i_mb_stride;
+            if( h->mb.cbp[i_mbb_xy]&0x30 )
+                i_nzb = h->mb.cbp[i_mbb_xy]&( 0x02 << ( 8 + i_idx) );
+        }
+    }
+    else if( i_cat == 4 )
+    {
+        /* position of the block inside its chroma plane (2x2 blocks) */
+        int idxc = i_idx% 4;
+
+        if( idxc == 1 || idxc == 3 )
+            i_mba_xy = h->mb.i_mb_xy;
+        else if( h->mb.i_mb_x > 0 )
+            i_mba_xy = h->mb.i_mb_xy - 1;
+
+        if( idxc == 2 || idxc == 3 )
+            i_mbb_xy = h->mb.i_mb_xy;
+        else if( h->mb.i_mb_y > 0 )
+            i_mbb_xy = h->mb.i_mb_xy - h->mb.i_mb_stride;
+
+        /* no need to test skip/pcm */
+        if( i_mba_xy >= 0 && (h->mb.cbp[i_mba_xy]&0x30) == 0x20 )
+            i_nza = h->mb.cache.non_zero_count[x264_scan8[16+i_idx] - 1];
+        if( i_mbb_xy >= 0 && (h->mb.cbp[i_mbb_xy]&0x30) == 0x20 )
+            i_nzb = h->mb.cache.non_zero_count[x264_scan8[16+i_idx] - 8];
+    }
+
+    /* a missing neighbour MB counts as coded for intra MBs */
+    if( ( i_mba_xy < 0  && IS_INTRA( h->mb.i_type ) ) || i_nza > 0 )
+        ctx++;
+    if( ( i_mbb_xy < 0  && IS_INTRA( h->mb.i_type ) ) || i_nzb > 0 )
+        ctx += 2;
+
+    return 4 * i_cat + ctx;
+}
+
+
+static void block_residual_write_cabac( x264_t *h, int i_ctxBlockCat, int i_idx, int *l, int i_count )
+{
+    static const int significant_coeff_flag_offset[5] = { 0, 15, 29, 44, 47 }; /* per category, added to context 105 */
+    static const int last_significant_coeff_flag_offset[5] = { 0, 15, 29, 44, 47 }; /* per category, added to context 166 */
+    static const int coeff_abs_level_m1_offset[5] = { 0, 10, 20, 30, 39 }; /* per category, added to context 227 */
+
+    int i_coeff_abs_m1[16]; /* |level| - 1 of the non zero coefficients */
+    int i_coeff_sign[16];   /* 1 for negative levels, 0 otherwise */
+    int i_coeff = 0;        /* number of non zero coefficients */
+    int i_last  = 0;        /* position in l[] of the last non zero coefficient */
+
+    int i_abslevel1 = 0;    /* levels with |level| == 1 written so far */
+    int i_abslevelgt1 = 0;  /* levels with |level| > 1 written so far */
+
+    int i;
+
+    /* Write one residual block l[0..i_count-1] with CABAC.
+     * i_ctxBlockCat: 0-> DC 16x16  i_idx = 0
+     *                1-> AC 16x16  i_idx = luma4x4idx
+     *                2-> Luma4x4   i_idx = luma4x4idx
+     *                3-> DC Chroma i_idx = iCbCr
+     *                4-> AC Chroma i_idx = 4 * iCbCr + chroma4x4idx */
+
+    /* collect |level| - 1 and sign of every non zero coefficient, in l[] order */
+    for( i = 0; i < i_count; i++ )
+    {
+        /* i_last ends up as the index of the last non zero coefficient */
+        if( l[i] != 0 )
+        {
+            i_coeff_abs_m1[i_coeff] = abs( l[i] ) - 1;
+            i_coeff_sign[i_coeff]   = ( l[i] < 0 ? 1 : 0);
+            i_coeff++;
+
+            i_last = i;
+        }
+    }
+    /* i_coeff == 0 means the whole block is zero */
+
+    if( i_coeff == 0 )
+    {
+        /* coded block flag = 0: nothing else is written for this block */
+        x264_cabac_encode_decision( &h->cabac,  85 + x264_cabac_mb_cbf_ctxidxinc( h, i_ctxBlockCat, i_idx ), 0 );
+        return;
+    }
+
+    /* block coded: coded block flag = 1 */
+    x264_cabac_encode_decision( &h->cabac,  85 + x264_cabac_mb_cbf_ctxidxinc( h, i_ctxBlockCat, i_idx ), 1 );
+    for( i = 0; i < i_count - 1; i++ ) /* significance map; position i_count - 1 is never signalled */
+    {
+        int i_ctxIdxInc;
+
+        i_ctxIdxInc = X264_MIN( i, i_count - 2 );
+
+        if( l[i] != 0 )
+        {
+            x264_cabac_encode_decision( &h->cabac, 105 + significant_coeff_flag_offset[i_ctxBlockCat] + i_ctxIdxInc, 1 );
+            x264_cabac_encode_decision( &h->cabac, 166 + last_significant_coeff_flag_offset[i_ctxBlockCat] + i_ctxIdxInc, i == i_last ? 1 : 0 );
+        }
+        else
+        {
+            x264_cabac_encode_decision( &h->cabac, 105 + significant_coeff_flag_offset[i_ctxBlockCat] + i_ctxIdxInc, 0 );
+        }
+        if( i == i_last )
+        {
+            break;
+        }
+    }
+
+    for( i = i_coeff - 1; i >= 0; i-- ) /* levels, last non zero coefficient first */
+    {
+        int i_prefix;
+        int i_ctxIdxInc;
+
+        /* write coeff_abs - 1 */
+
+        /* prefix: unary, at most 14 '1' bins */
+        i_prefix = X264_MIN( i_coeff_abs_m1[i], 14 );
+
+        i_ctxIdxInc = (i_abslevelgt1 != 0 ? 0 : X264_MIN( 4, i_abslevel1 + 1 )) + coeff_abs_level_m1_offset[i_ctxBlockCat];
+        if( i_prefix == 0 )
+        {
+            x264_cabac_encode_decision( &h->cabac,  227 + i_ctxIdxInc, 0 );
+        }
+        else
+        {
+            int j;
+            x264_cabac_encode_decision( &h->cabac,  227 + i_ctxIdxInc, 1 );
+            i_ctxIdxInc = 5 + X264_MIN( 4, i_abslevelgt1 ) + coeff_abs_level_m1_offset[i_ctxBlockCat];
+            for( j = 0; j < i_prefix - 1; j++ )
+            {
+                x264_cabac_encode_decision( &h->cabac,  227 + i_ctxIdxInc, 1 );
+            }
+            if( i_prefix < 14 )
+            {
+                x264_cabac_encode_decision( &h->cabac,  227 + i_ctxIdxInc, 0 );
+            }
+        }
+        /* suffix: bypass coded escape, only when coeff_abs - 1 >= 14 */
+        if( i_coeff_abs_m1[i] >= 14 )
+        {
+            int k = 0;
+            int i_suffix = i_coeff_abs_m1[i] - 14;
+
+            while( i_suffix >= (1<<k) )
+            {
+                x264_cabac_encode_bypass( &h->cabac, 1 );
+                i_suffix -= 1 << k;
+                k++;
+            }
+            x264_cabac_encode_bypass( &h->cabac, 0 );
+            while( k-- )
+            {
+                x264_cabac_encode_bypass( &h->cabac, (i_suffix >> k)&0x01 );
+            }
+        }
+
+        /* write sign */
+        x264_cabac_encode_bypass( &h->cabac, i_coeff_sign[i] );
+
+
+        if( i_coeff_abs_m1[i] == 0 ) /* update the counters that select the contexts of the next level */
+        {
+            i_abslevel1++;
+        }
+        else
+        {
+            i_abslevelgt1++;
+        }
+    }
+}
+
+
+
+void x264_macroblock_write_cabac( x264_t *h, bs_t *s ) /* write the current MB: type, prediction/motion data, cbp, qp delta, residual */
+{
+    const int i_mb_type = h->mb.i_type;
+    int i;
+
+    /* Write the MB type */
+    x264_cabac_mb_type( h );
+
+    /* PCM special block type UNTESTED: the only path writing to s directly */
+    if( i_mb_type == I_PCM )
+    {
+        bs_align_0( s );    /* not sure */
+        /* Luma: 16x16 samples of plane 0, 8 bits each */
+        for( i = 0; i < 16*16; i++ )
+        {
+            const int x = 16 * h->mb.i_mb_x + (i % 16);
+            const int y = 16 * h->mb.i_mb_y + (i / 16);
+            bs_write( s, 8, h->fenc->plane[0][y*h->mb.pic.i_stride[0]+x] );
+        }
+        /* Cb: 8x8 samples of plane 1 */
+        for( i = 0; i < 8*8; i++ )
+        {
+            const int x = 8 * h->mb.i_mb_x + (i % 8);
+            const int y = 8 * h->mb.i_mb_y + (i / 8);
+            bs_write( s, 8, h->fenc->plane[1][y*h->mb.pic.i_stride[1]+x] );
+        }
+        /* Cr: 8x8 samples of plane 2 */
+        for( i = 0; i < 8*8; i++ )
+        {
+            const int x = 8 * h->mb.i_mb_x + (i % 8);
+            const int y = 8 * h->mb.i_mb_y + (i / 8);
+            bs_write( s, 8, h->fenc->plane[2][y*h->mb.pic.i_stride[2]+x] );
+        }
+        x264_cabac_encode_init( &h->cabac, s ); /* re-initialise the CABAC encoder on s after the raw samples */
+        return;
+    }
+
+    if( IS_INTRA( i_mb_type ) )
+    {
+        /* Prediction: one 4x4 mode per block for I_4x4, coded against its predicted mode */
+        if( i_mb_type == I_4x4 )
+        {
+            for( i = 0; i < 16; i++ )
+            {
+                const int i_pred = x264_mb_predict_intra4x4_mode( h, i );
+                const int i_mode = h->mb.cache.intra4x4_pred_mode[x264_scan8[i]];
+                x264_cabac_mb_intra4x4_pred_mode( h, i_pred, i_mode );
+            }
+        }
+        x264_cabac_mb_intra8x8_pred_mode( h ); /* called for every non PCM intra MB; presumably the chroma mode - confirm */
+    }
+    else if( i_mb_type == P_L0 )
+    {
+        if( h->mb.i_partition == D_16x16 )
+        {
+            if( h->sh.i_num_ref_idx_l0_active > 1 ) /* ref idx only written when several references are active */
+            {
+                x264_cabac_mb_ref( h, 0, 0 );
+            }
+            x264_cabac_mb_mvd( h, 0, 0, 4, 4 );
+        }
+        else if( h->mb.i_partition == D_16x8 )
+        {
+            if( h->sh.i_num_ref_idx_l0_active > 1 )
+            {
+                x264_cabac_mb_ref( h, 0, 0 );
+                x264_cabac_mb_ref( h, 0, 8 );
+            }
+            x264_cabac_mb_mvd( h, 0, 0, 4, 2 );
+            x264_cabac_mb_mvd( h, 0, 8, 4, 2 );
+        }
+        else if( h->mb.i_partition == D_8x16 )
+        {
+            if( h->sh.i_num_ref_idx_l0_active > 1 )
+            {
+                x264_cabac_mb_ref( h, 0, 0 );
+                x264_cabac_mb_ref( h, 0, 4 );
+            }
+            x264_cabac_mb_mvd( h, 0, 0, 2, 4 );
+            x264_cabac_mb_mvd( h, 0, 4, 2, 4 );
+        }
+    }
+    else if( i_mb_type == P_8x8 )
+    {
+        /* sub mb type: all four are written before any ref or mvd */
+        x264_cabac_mb_sub_p_partition( h, h->mb.i_sub_partition[0] );
+        x264_cabac_mb_sub_p_partition( h, h->mb.i_sub_partition[1] );
+        x264_cabac_mb_sub_p_partition( h, h->mb.i_sub_partition[2] );
+        x264_cabac_mb_sub_p_partition( h, h->mb.i_sub_partition[3] );
+
+        /* ref 0: one ref idx per 8x8 partition (blocks 0, 4, 8, 12) */
+        if( h->sh.i_num_ref_idx_l0_active > 1 )
+        {
+            x264_cabac_mb_ref( h, 0, 0 );
+            x264_cabac_mb_ref( h, 0, 4 );
+            x264_cabac_mb_ref( h, 0, 8 );
+            x264_cabac_mb_ref( h, 0, 12 );
+        }
+
+        for( i = 0; i < 4; i++ ) /* mvd of every sub partition of 8x8 partition i */
+        {
+            switch( h->mb.i_sub_partition[i] )
+            {
+                case D_L0_8x8:
+                    x264_cabac_mb_mvd( h, 0, 4*i, 2, 2 );
+                    break;
+                case D_L0_8x4:
+                    x264_cabac_mb_mvd( h, 0, 4*i+0, 2, 1 );
+                    x264_cabac_mb_mvd( h, 0, 4*i+2, 2, 1 );
+                    break;
+                case D_L0_4x8:
+                    x264_cabac_mb_mvd( h, 0, 4*i+0, 1, 2 );
+                    x264_cabac_mb_mvd( h, 0, 4*i+1, 1, 2 );
+                    break;
+                case D_L0_4x4:
+                    x264_cabac_mb_mvd( h, 0, 4*i+0, 1, 1 );
+                    x264_cabac_mb_mvd( h, 0, 4*i+1, 1, 1 );
+                    x264_cabac_mb_mvd( h, 0, 4*i+2, 1, 1 );
+                    x264_cabac_mb_mvd( h, 0, 4*i+3, 1, 1 );
+                    break;
+            }
+        }
+    }
+    else if( i_mb_type == B_8x8 )
+    {
+        /* TODO: B_8x8 is not implemented, nothing but the MB type is written */
+        fprintf( stderr, "Arggg B_8x8\n" );
+        return;
+    }
+    else if( i_mb_type != B_DIRECT )
+    {
+        /* All B mode (B_DIRECT skips this branch: no ref, no mvd) */
+        int i_list;
+        int b_list[2][2];
+
+        /* init ref list utilisations: b_list[list][partition] != 0 when that partition uses that list */
+        for( i = 0; i < 2; i++ )
+        {
+            b_list[0][i] = x264_mb_type_list0_table[i_mb_type][i];
+            b_list[1][i] = x264_mb_type_list1_table[i_mb_type][i];
+        }
+
+        for( i_list = 0; i_list < 2; i_list++ ) /* first every ref idx, list 0 then list 1 */
+        {
+            const int i_ref_max = i_list == 0 ? h->sh.i_num_ref_idx_l0_active : h->sh.i_num_ref_idx_l1_active;
+
+            if( i_ref_max > 1 )
+            {
+                if( h->mb.i_partition == D_16x16 )
+                {
+                    if( b_list[i_list][0] ) x264_cabac_mb_ref( h, i_list, 0 );
+                }
+                else if( h->mb.i_partition == D_16x8 )
+                {
+                    if( b_list[i_list][0] ) x264_cabac_mb_ref( h, i_list, 0 );
+                    if( b_list[i_list][1] ) x264_cabac_mb_ref( h, i_list, 8 );
+                }
+                else if( h->mb.i_partition == D_8x16 )
+                {
+                    if( b_list[i_list][0] ) x264_cabac_mb_ref( h, i_list, 0 );
+                    if( b_list[i_list][1] ) x264_cabac_mb_ref( h, i_list, 4 );
+                }
+            }
+        }
+        for( i_list = 0; i_list < 2; i_list++ ) /* then every mvd, list 0 then list 1 */
+        {
+            if( h->mb.i_partition == D_16x16 )
+            {
+                if( b_list[i_list][0] ) x264_cabac_mb_mvd( h, i_list, 0, 4, 4 );
+            }
+            else if( h->mb.i_partition == D_16x8 )
+            {
+                if( b_list[i_list][0] ) x264_cabac_mb_mvd( h, i_list, 0, 4, 2 );
+                if( b_list[i_list][1] ) x264_cabac_mb_mvd( h, i_list, 8, 4, 2 );
+            }
+            else if( h->mb.i_partition == D_8x16 )
+            {
+                if( b_list[i_list][0] ) x264_cabac_mb_mvd( h, i_list, 0, 2, 4 );
+                if( b_list[i_list][1] ) x264_cabac_mb_mvd( h, i_list, 4, 2, 4 );
+            }
+        }
+    }
+
+    if( i_mb_type != I_16x16 ) /* no explicit cbp is written for I_16x16 */
+    {
+        x264_cabac_mb_cbp_luma( h );
+        x264_cabac_mb_cbp_chroma( h );
+    }
+
+    if( h->mb.i_cbp_luma > 0 || h->mb.i_cbp_chroma > 0 || i_mb_type == I_16x16 ) /* qp delta and residual only when something is coded */
+    {
+        x264_cabac_mb_qp_delta( h );
+
+        /* write residual */
+        if( i_mb_type == I_16x16 )
+        {
+            /* DC Luma: always written for I_16x16 */
+            block_residual_write_cabac( h, 0, 0, h->dct.luma16x16_dc, 16 );
+
+            if( h->mb.i_cbp_luma != 0 )
+            {
+                /* AC Luma: 15 coefficients per 4x4 block */
+                for( i = 0; i < 16; i++ )
+                {
+                    block_residual_write_cabac( h, 1, i, h->dct.block[i].residual_ac, 15 );
+                }
+            }
+        }
+        else
+        {
+            for( i = 0; i < 16; i++ )
+            {
+                if( h->mb.i_cbp_luma & ( 1 << ( i / 4 ) ) ) /* one luma cbp bit per group of four 4x4 blocks */
+                {
+                    block_residual_write_cabac( h, 2, i, h->dct.block[i].luma4x4, 16 );
+                }
+            }
+        }
+
+        if( h->mb.i_cbp_chroma &0x03 )    /* Chroma DC residual present */
+        {
+            block_residual_write_cabac( h, 3, 0, h->dct.chroma_dc[0], 4 );
+            block_residual_write_cabac( h, 3, 1, h->dct.chroma_dc[1], 4 );
+        }
+        if( h->mb.i_cbp_chroma&0x02 ) /* Chroma AC residual present */
+        {
+            for( i = 0; i < 8; i++ )
+            {
+                block_residual_write_cabac( h, 4, i, h->dct.block[16+i].residual_ac, 15 );
+            }
+        }
+    }
+}
+
diff --git a/encoder/cavlc.c b/encoder/cavlc.c
new file mode 100644
index 00000000..de7ba547
--- /dev/null
+++ b/encoder/cavlc.c
@@ -0,0 +1,688 @@
+/*****************************************************************************
+ * cavlc.c: h264 encoder library
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: cavlc.c,v 1.1 2004/06/03 19:27:08 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdint.h>
+
+#include "../core/common.h"
+#include "../core/vlc.h"
+#include "macroblock.h"
+
+static const uint8_t intra4x4_cbp_to_golomb[48]= /* NOTE(review): presumably coded block pattern (0..47) -> Exp-Golomb code number for I_4x4 MBs; confirm against the users of this table */
+{
+  3, 29, 30, 17, 31, 18, 37,  8, 32, 38, 19,  9, 20, 10, 11,  2,
+ 16, 33, 34, 21, 35, 22, 39,  4, 36, 40, 23,  5, 24,  6,  7,  1,
+ 41, 42, 43, 25, 44, 26, 46, 12, 45, 47, 27, 13, 28, 14, 15,  0
+};
+static const uint8_t inter_cbp_to_golomb[48]= /* NOTE(review): presumably the same mapping for inter MBs; confirm against the users of this table */
+{
+  0,  2,  3,  7,  4,  8, 17, 13,  5, 18,  9, 14, 10, 15, 16, 11,
+  1, 32, 33, 36, 34, 37, 44, 40, 35, 45, 38, 41, 39, 42, 43, 19,
+  6, 24, 25, 20, 26, 21, 46, 28, 27, 47, 22, 29, 23, 30, 31, 12
+};
+
+static const uint8_t block_idx_x[16] = /* column (0..3) of 4x4 block idx */
+{
+    0, 1, 0, 1, 2, 3, 2, 3, 0, 1, 0, 1, 2, 3, 2, 3
+};
+static const uint8_t block_idx_y[16] = /* row (0..3) of 4x4 block idx */
+{
+    0, 0, 1, 1, 0, 0, 1, 1, 2, 2, 3, 3, 2, 2, 3, 3
+};
+static const uint8_t block_idx_xy[4][4] = /* inverse of the two tables above: block_idx_xy[x][y] is the block index at column x, row y */
+{
+    { 0, 2, 8,  10},
+    { 1, 3, 9,  11},
+    { 4, 6, 12, 14},
+    { 5, 7, 13, 15}
+};
+
+#define BLOCK_INDEX_CHROMA_DC   (-1) /* i_idx selecting the chroma DC tables in block_residual_write_cavlc */
+#define BLOCK_INDEX_LUMA_DC     (-2) /* i_idx of the luma DC block, whose nC is predicted like block 0 */
+
+static inline void bs_write_vlc( bs_t *s, vlc_t v ) /* append code word v to s: v.i_size bits taken from v.i_bits */
+{
+    bs_write( s, v.i_size, v.i_bits );
+}
+
+/****************************************************************************
+ * block_residual_write_cavlc: write the i_count coefficients l[] of one block
+ ****************************************************************************/
+static void block_residual_write_cavlc( x264_t *h, bs_t *s, int i_idx, int *l, int i_count )
+{
+    int level[16], run[16]; /* non zero levels, last one first, and the zeros preceding each of them in l[] */
+    int i_total, i_trailing; /* number of non zero levels; number of leading +-1 entries of level[] (at most 3) */
+    int i_total_zero; /* zeros located before the last non zero coefficient */
+    int i_last;
+    unsigned int i_sign; /* sign bits of the trailing ones, 1 = negative, first one in the highest bit */
+
+    int i;
+    int i_zero_left;
+    int i_suffix_length;
+
+    /* first find i_last, the index of the last non zero coefficient (-1 if none) */
+    i_last = i_count - 1;
+    while( i_last >= 0 && l[i_last] == 0 )
+    {
+        i_last--;
+    }
+
+    i_sign = 0;
+    i_total = 0;
+    i_trailing = 0;
+    i_total_zero = 0;
+
+    if( i_last >= 0 )
+    {
+        int b_trailing = 1;
+        int idx = 0;
+
+        /* level and run and total, walking from the last non zero coefficient down to l[0] */
+        while( i_last >= 0 )
+        {
+            level[idx] = l[i_last--];
+
+            run[idx] = 0;
+            while( i_last >= 0 && l[i_last] == 0 )
+            {
+                run[idx]++;
+                i_last--;
+            }
+
+            i_total++;
+            i_total_zero += run[idx];
+
+            if( b_trailing && abs( level[idx] ) == 1 && i_trailing < 3 )
+            {
+                i_sign <<= 1;
+                if( level[idx] < 0 )
+                {
+                    i_sign |= 0x01;
+                }
+
+                i_trailing++;
+            }
+            else
+            {
+                b_trailing = 0; /* first level that is not +-1 (or the 4th one) ends the trailing ones */
+            }
+
+            idx++;
+        }
+    }
+
+    /* total/trailing: table 4 for chroma DC, else picked from the predicted non zero count nC */
+    if( i_idx == BLOCK_INDEX_CHROMA_DC )
+    {
+        bs_write_vlc( s, x264_coeff_token[4][i_total*4+i_trailing] );
+    }
+    else
+    {
+        /* x264_mb_predict_non_zero_code return 0 <-> (16+16+1)>>1 = 16 */
+        static const int ct_index[17] = {0,0,1,1,2,2,2,2,3,3,3,3,3,3,3,3,3 };
+        int nC;
+
+        if( i_idx == BLOCK_INDEX_LUMA_DC ) /* luma DC is predicted like block 0 */
+        {
+            nC = x264_mb_predict_non_zero_code( h, 0 );
+        }
+        else
+        {
+            nC = x264_mb_predict_non_zero_code( h, i_idx );
+        }
+
+        bs_write_vlc( s, x264_coeff_token[ct_index[nC]][i_total*4+i_trailing] );
+    }
+
+    if( i_total <= 0 ) /* empty block: the coeff token is all there is */
+    {
+        return;
+    }
+
+    i_suffix_length = i_total > 10 && i_trailing < 3 ? 1 : 0;
+    if( i_trailing > 0 )
+    {
+        bs_write( s, i_trailing, i_sign ); /* one sign bit per trailing one */
+    }
+    for( i = i_trailing; i < i_total; i++ ) /* remaining levels, still last coefficient first */
+    {
+        int i_level_code;
+
+        /* calculate level code: even for positive levels, odd for negative ones */
+        if( level[i] < 0 )
+        {
+            i_level_code = -2*level[i] - 1;
+        }
+        else /* if( level[i] > 0 ) */
+        {
+            i_level_code = 2 * level[i] - 2;
+        }
+        if( i == i_trailing && i_trailing < 3 )
+        {
+            i_level_code -=2; /* as level[i] can't be 1 for the first one if i_trailing < 3 */
+        }
+
+        if( ( i_level_code >> i_suffix_length ) < 14 )
+        {
+            bs_write_vlc( s, x264_level_prefix[i_level_code >> i_suffix_length] );
+            if( i_suffix_length > 0 )
+            {
+                bs_write( s, i_suffix_length, i_level_code );
+            }
+        }
+        else if( i_suffix_length == 0 && i_level_code < 30 )
+        {
+            bs_write_vlc( s, x264_level_prefix[14] );
+            bs_write( s, 4, i_level_code - 14 );
+        }
+        else if( i_suffix_length > 0 && ( i_level_code >> i_suffix_length ) == 14 )
+        {
+            bs_write_vlc( s, x264_level_prefix[14] );
+            bs_write( s, i_suffix_length, i_level_code );
+        }
+        else
+        {
+            bs_write_vlc( s, x264_level_prefix[15] ); /* escape: prefix 15 followed by a 12 bit suffix */
+            i_level_code -= 15 << i_suffix_length;
+            if( i_suffix_length == 0 )
+            {
+                i_level_code -= 15;
+            }
+
+            if( i_level_code >= ( 1 << 12 ) || i_level_code < 0 )
+            {
+                fprintf( stderr, "OVERFLOW levelcode=%d\n", i_level_code ); /* only reported: the value is still written below */
+            }
+
+            bs_write( s, 12, i_level_code );    /* check overflow ?? */
+        }
+
+        if( i_suffix_length == 0 )
+        {
+            i_suffix_length++;
+        }
+        if( abs( level[i] ) > ( 3 << ( i_suffix_length - 1 ) ) && i_suffix_length < 6 )
+        {
+            i_suffix_length++;
+        }
+    }
+
+    if( i_total < i_count ) /* total zeros: skipped when every coefficient is non zero */
+    {
+        if( i_idx == BLOCK_INDEX_CHROMA_DC )
+        {
+            bs_write_vlc( s, x264_total_zeros_dc[i_total-1][i_total_zero] );
+        }
+        else
+        {
+            bs_write_vlc( s, x264_total_zeros[i_total-1][i_total_zero] );
+        }
+    }
+
+    for( i = 0, i_zero_left = i_total_zero; i < i_total - 1; i++ ) /* zero run before each level except the first one in l[] */
+    {
+        int i_zl;
+
+        if( i_zero_left <= 0 ) /* no zero left: the remaining runs are all 0 and not written */
+        {
+            break;
+        }
+
+        i_zl = X264_MIN( i_zero_left - 1, 6 );
+
+        bs_write_vlc( s, x264_run_before[i_zl][run[i]] );
+
+        i_zero_left -= run[i];
+    }
+}
+
+/*****************************************************************************
+ * x264_macroblock_write_cavlc:
+ *****************************************************************************/
+void x264_macroblock_write_cavlc( x264_t *h, bs_t *s )
+{
+    const int i_mb_type = h->mb.i_type;
+    int i_mb_i_offset;
+    int i;
+
+    switch( h->sh.i_type )
+    {
+        case SLICE_TYPE_I:
+            i_mb_i_offset = 0;
+            break;
+        case SLICE_TYPE_P:
+            i_mb_i_offset = 5;
+            break;
+        case SLICE_TYPE_B:
+            i_mb_i_offset = 23;
+            break;
+        default:
+            fprintf( stderr, "internal error or slice unsupported\n" );
+            return;
+    }
+
+    /* Write:
+      - type
+      - prediction
+      - mv */
+    if( i_mb_type == I_PCM )
+    {
+        /* Untested */
+        bs_write_ue( s, i_mb_i_offset + 25 );
+
+        bs_align_0( s );
+        /* Luma */
+        for( i = 0; i < 16*16; i++ )
+        {
+            const int x = 16 * h->mb.i_mb_x + (i % 16);
+            const int y = 16 * h->mb.i_mb_y + (i / 16);
+            bs_write( s, 8, h->fenc->plane[0][y*h->mb.pic.i_stride[0]+x] );
+        }
+        /* Cb */
+        for( i = 0; i < 8*8; i++ )
+        {
+            const int x = 8 * h->mb.i_mb_x + (i % 8);
+            const int y = 8 * h->mb.i_mb_y + (i / 8);
+            bs_write( s, 8, h->fenc->plane[1][y*h->mb.pic.i_stride[1]+x] );
+        }
+        /* Cr */
+        for( i = 0; i < 8*8; i++ )
+        {
+            const int x = 8 * h->mb.i_mb_x + (i % 8);
+            const int y = 8 * h->mb.i_mb_y + (i / 8);
+            bs_write( s, 8, h->fenc->plane[2][y*h->mb.pic.i_stride[2]+x] );
+        }
+        return;
+    }
+    else if( i_mb_type == I_4x4 )
+    {
+        bs_write_ue( s, i_mb_i_offset + 0 );
+
+        /* Prediction: Luma */
+        for( i = 0; i < 16; i++ )
+        {
+            int i_pred = x264_mb_predict_intra4x4_mode( h, i );
+            int i_mode = h->mb.cache.intra4x4_pred_mode[x264_scan8[i]];
+
+            if( i_pred == i_mode)
+            {
+                bs_write1( s, 1 );  /* b_prev_intra4x4_pred_mode */
+            }
+            else
+            {
+                bs_write1( s, 0 );  /* b_prev_intra4x4_pred_mode */
+                if( i_mode < i_pred )
+                {
+                    bs_write( s, 3, i_mode );
+                }
+                else
+                {
+                    bs_write( s, 3, i_mode - 1 );
+                }
+            }
+        }
+        bs_write_ue( s, h->mb.i_chroma_pred_mode );
+    }
+    else if( i_mb_type == I_16x16 )
+    {
+        bs_write_ue( s, i_mb_i_offset + 1 + h->mb.i_intra16x16_pred_mode +
+                        h->mb.i_cbp_chroma * 4 + ( h->mb.i_cbp_luma == 0 ? 0 : 12 ) );
+        bs_write_ue( s, h->mb.i_chroma_pred_mode );
+    }
+    else if( i_mb_type == P_L0 )
+    {
+        int mvp[2];
+
+        if( h->mb.i_partition == D_16x16 )
+        {
+            bs_write_ue( s, 0 );
+
+            if( h->sh.i_num_ref_idx_l0_active > 1 )
+            {
+                bs_write_te( s, h->sh.i_num_ref_idx_l0_active - 1, h->mb.cache.ref[0][x264_scan8[0]] );
+            }
+            x264_mb_predict_mv( h, 0, 0, 4, mvp );
+            bs_write_se( s, h->mb.cache.mv[0][x264_scan8[0]][0] - mvp[0] );
+            bs_write_se( s, h->mb.cache.mv[0][x264_scan8[0]][1] - mvp[1] );
+        }
+        else if( h->mb.i_partition == D_16x8 )
+        {
+            bs_write_ue( s, 1 );
+            if( h->sh.i_num_ref_idx_l0_active > 1 )
+            {
+                bs_write_te( s, h->sh.i_num_ref_idx_l0_active - 1, h->mb.cache.ref[0][x264_scan8[0]] );
+                bs_write_te( s, h->sh.i_num_ref_idx_l0_active - 1, h->mb.cache.ref[0][x264_scan8[8]] );
+            }
+
+            x264_mb_predict_mv( h, 0, 0, 4, mvp );
+            bs_write_se( s, h->mb.cache.mv[0][x264_scan8[0]][0] - mvp[0] );
+            bs_write_se( s, h->mb.cache.mv[0][x264_scan8[0]][1] - mvp[1] );
+
+            x264_mb_predict_mv( h, 0, 8, 4, mvp );
+            bs_write_se( s, h->mb.cache.mv[0][x264_scan8[8]][0] - mvp[0] );
+            bs_write_se( s, h->mb.cache.mv[0][x264_scan8[8]][1] - mvp[1] );
+        }
+        else if( h->mb.i_partition == D_8x16 )
+        {
+            bs_write_ue( s, 2 );
+            if( h->sh.i_num_ref_idx_l0_active > 1 )
+            {
+                bs_write_te( s, h->sh.i_num_ref_idx_l0_active - 1, h->mb.cache.ref[0][x264_scan8[0]] );
+                bs_write_te( s, h->sh.i_num_ref_idx_l0_active - 1, h->mb.cache.ref[0][x264_scan8[4]] );
+            }
+
+            x264_mb_predict_mv( h, 0, 0, 2, mvp );
+            bs_write_se( s, h->mb.cache.mv[0][x264_scan8[0]][0] - mvp[0] );
+            bs_write_se( s, h->mb.cache.mv[0][x264_scan8[0]][1] - mvp[1] );
+
+            x264_mb_predict_mv( h, 0, 4, 2, mvp );
+            bs_write_se( s, h->mb.cache.mv[0][x264_scan8[4]][0] - mvp[0] );
+            bs_write_se( s, h->mb.cache.mv[0][x264_scan8[4]][1] - mvp[1] );
+        }
+    }
+    else if( i_mb_type == P_8x8 )
+    {
+        int b_sub_ref0;
+
+        if( h->mb.cache.ref[0][x264_scan8[0]] == 0 && h->mb.cache.ref[0][x264_scan8[4]] == 0 &&
+            h->mb.cache.ref[0][x264_scan8[8]] == 0 && h->mb.cache.ref[0][x264_scan8[12]] == 0 )
+        {
+            bs_write_ue( s, 4 );
+            b_sub_ref0 = 0;
+        }
+        else
+        {
+            bs_write_ue( s, 3 );
+            b_sub_ref0 = 1;
+        }
+        /* sub mb type */
+        for( i = 0; i < 4; i++ )
+        {
+            switch( h->mb.i_sub_partition[i] )
+            {
+                case D_L0_8x8:
+                    bs_write_ue( s, 0 );
+                    break;
+                case D_L0_8x4:
+                    bs_write_ue( s, 1 );
+                    break;
+                case D_L0_4x8:
+                    bs_write_ue( s, 2 );
+                    break;
+                case D_L0_4x4:
+                    bs_write_ue( s, 3 );
+                    break;
+            }
+        }
+        /* ref0 */
+        if( h->sh.i_num_ref_idx_l0_active > 1 && b_sub_ref0 )
+        {
+            bs_write_te( s, h->sh.i_num_ref_idx_l0_active - 1, h->mb.cache.ref[0][x264_scan8[0]] );
+            bs_write_te( s, h->sh.i_num_ref_idx_l0_active - 1, h->mb.cache.ref[0][x264_scan8[4]] );
+            bs_write_te( s, h->sh.i_num_ref_idx_l0_active - 1, h->mb.cache.ref[0][x264_scan8[8]] );
+            bs_write_te( s, h->sh.i_num_ref_idx_l0_active - 1, h->mb.cache.ref[0][x264_scan8[12]] );
+        }
+        for( i = 0; i < 4; i++ )
+        {
+            int mvp[2];
+
+            switch( h->mb.i_sub_partition[i] )
+            {
+                case D_L0_8x8:
+                    x264_mb_predict_mv( h, 0, 4*i, 2, mvp );
+                    bs_write_se( s, h->mb.cache.mv[0][x264_scan8[4*i]][0] - mvp[0] );
+                    bs_write_se( s, h->mb.cache.mv[0][x264_scan8[4*i]][1] - mvp[1] );
+                    break;
+                case D_L0_8x4:
+                    x264_mb_predict_mv( h, 0, 4*i+0, 2, mvp );
+                    bs_write_se( s, h->mb.cache.mv[0][x264_scan8[4*i]][0] - mvp[0] );
+                    bs_write_se( s, h->mb.cache.mv[0][x264_scan8[4*i]][1] - mvp[1] );
+
+                    x264_mb_predict_mv( h, 0, 4*i+2, 2, mvp );
+                    bs_write_se( s, h->mb.cache.mv[0][x264_scan8[4*i+2]][0] - mvp[0] );
+                    bs_write_se( s, h->mb.cache.mv[0][x264_scan8[4*i+2]][1] - mvp[1] );
+                    break;
+                case D_L0_4x8:
+                    x264_mb_predict_mv( h, 0, 4*i+0, 1, mvp );
+                    bs_write_se( s, h->mb.cache.mv[0][x264_scan8[4*i]][0] - mvp[0] );
+                    bs_write_se( s, h->mb.cache.mv[0][x264_scan8[4*i]][1] - mvp[1] );
+
+                    x264_mb_predict_mv( h, 0, 4*i+1, 1, mvp );
+                    bs_write_se( s, h->mb.cache.mv[0][x264_scan8[4*i+1]][0] - mvp[0] );
+                    bs_write_se( s, h->mb.cache.mv[0][x264_scan8[4*i+1]][1] - mvp[1] );
+                    break;
+                case D_L0_4x4:
+                    x264_mb_predict_mv( h, 0, 4*i+0, 1, mvp );
+                    bs_write_se( s, h->mb.cache.mv[0][x264_scan8[4*i]][0] - mvp[0] );
+                    bs_write_se( s, h->mb.cache.mv[0][x264_scan8[4*i]][1] - mvp[1] );
+
+                    x264_mb_predict_mv( h, 0, 4*i+1, 1, mvp );
+                    bs_write_se( s, h->mb.cache.mv[0][x264_scan8[4*i+1]][0] - mvp[0] );
+                    bs_write_se( s, h->mb.cache.mv[0][x264_scan8[4*i+1]][1] - mvp[1] );
+
+                    x264_mb_predict_mv( h, 0, 4*i+2, 1, mvp );
+                    bs_write_se( s, h->mb.cache.mv[0][x264_scan8[4*i+2]][0] - mvp[0] );
+                    bs_write_se( s, h->mb.cache.mv[0][x264_scan8[4*i+2]][1] - mvp[1] );
+
+                    x264_mb_predict_mv( h, 0, 4*i+3, 1, mvp );
+                    bs_write_se( s, h->mb.cache.mv[0][x264_scan8[4*i+3]][0] - mvp[0] );
+                    bs_write_se( s, h->mb.cache.mv[0][x264_scan8[4*i+3]][1] - mvp[1] );
+                    break;
+            }
+        }
+    }
+    else if( i_mb_type == B_8x8 )
+    {
+        fprintf( stderr, "invalid/unhandled mb_type (B_8x8)\n" );
+        return;
+    }
+    else if( i_mb_type != B_DIRECT )
+    {
+        /* All B mode */
+        /* Motion Vector */
+        int i_list;
+        int mvp[2];
+
+        int b_list[2][2];
+
+        /* init ref list utilisations */
+        for( i = 0; i < 2; i++ )
+        {
+            b_list[0][i] = x264_mb_type_list0_table[i_mb_type][i];
+            b_list[1][i] = x264_mb_type_list1_table[i_mb_type][i];
+        }
+
+
+        if( h->mb.i_partition == D_16x16 )
+        {
+            if( b_list[0][0] && b_list[1][0] )
+            {
+                bs_write_ue( s, 3 );
+            }
+            else if( b_list[1][0] )
+            {
+                bs_write_ue( s, 2 );
+            }
+            else
+            {
+                bs_write_ue( s, 1 );
+            }
+        }
+        else
+        {
+            if( i_mb_type == B_BI_BI )
+            {
+                bs_write_ue( s, 20 + (h->mb.i_partition == D_16x8 ? 0 : 1 ) );
+            }
+            else if( b_list[0][0] && b_list[1][0] )
+            {
+                /* B_BI_LX* */
+                bs_write_ue( s, 16 + (b_list[0][1]?0:2) + (h->mb.i_partition == D_16x8?0:1) );
+            }
+            else if( b_list[0][1] && b_list[1][1] )
+            {
+                /* B_LX_BI */
+                bs_write_ue( s, 12 + (b_list[0][1]?0:2) + (h->mb.i_partition == D_16x8?0:1) );
+            }
+            else if( b_list[1][1] )
+            {
+                /* B_LX_L1 */
+                bs_write_ue( s, 6 + (b_list[0][0]?2:0) + (h->mb.i_partition == D_16x8?0:1) );
+            }
+            else if( b_list[0][1] )
+            {
+                /* B_LX_L0 */
+                bs_write_ue( s, 4 + (b_list[0][0]?0:6) + (h->mb.i_partition == D_16x8?0:1) );
+            }
+        }
+
+        for( i_list = 0; i_list < 2; i_list++ )
+        {
+            const int i_ref_max = i_list == 0 ? h->sh.i_num_ref_idx_l0_active : h->sh.i_num_ref_idx_l1_active;
+
+            if( i_ref_max > 1 )
+            {
+                switch( h->mb.i_partition )
+                {
+                    case D_16x16:
+                        if( b_list[i_list][0] ) bs_write_te( s, i_ref_max - 1, h->mb.cache.ref[i_list][x264_scan8[0]] );
+                        break;
+                    case D_16x8:
+                        if( b_list[i_list][0] ) bs_write_te( s, i_ref_max - 1, h->mb.cache.ref[i_list][x264_scan8[0]] );
+                        if( b_list[i_list][1] ) bs_write_te( s, i_ref_max - 1, h->mb.cache.ref[i_list][x264_scan8[8]] );
+                        break;
+                    case D_8x16:
+                        if( b_list[i_list][0] ) bs_write_te( s, i_ref_max - 1, h->mb.cache.ref[i_list][x264_scan8[0]] );
+                        if( b_list[i_list][1] ) bs_write_te( s, i_ref_max - 1, h->mb.cache.ref[i_list][x264_scan8[4]] );
+                        break;
+                }
+            }
+        }
+        for( i_list = 0; i_list < 2; i_list++ )
+        {
+            switch( h->mb.i_partition )
+            {
+                case D_16x16:
+                    if( b_list[i_list][0] )
+                    {
+                        x264_mb_predict_mv( h, i_list, 0, 4, mvp );
+                        bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[0]][0] - mvp[0] );
+                        bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[0]][1] - mvp[1] );
+                    }
+                    break;
+                case D_16x8:
+                    if( b_list[i_list][0] )
+                    {
+                        x264_mb_predict_mv( h, i_list, 0, 4, mvp );
+                        bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[0]][0] - mvp[0] );
+                        bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[0]][1] - mvp[1] );
+                    }
+                    if( b_list[i_list][1] )
+                    {
+                        x264_mb_predict_mv( h, i_list, 8, 4, mvp );
+                        bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[8]][0] - mvp[0] );
+                        bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[8]][1] - mvp[1] );
+                    }
+                    break;
+                case D_8x16:
+                    if( b_list[i_list][0] )
+                    {
+                        x264_mb_predict_mv( h, i_list, 0, 2, mvp );
+                        bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[0]][0] - mvp[0] );
+                        bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[0]][1] - mvp[1] );
+                    }
+                    if( b_list[i_list][1] )
+                    {
+                        x264_mb_predict_mv( h, i_list, 4, 2, mvp );
+                        bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[4]][0] - mvp[0] );
+                        bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[4]][1] - mvp[1] );
+                    }
+                    break;
+            }
+        }
+    }
+    else if( i_mb_type == B_DIRECT )
+    {
+        bs_write_ue( s, 0 );
+    }
+    else
+    {
+        fprintf( stderr, "invalid/unhandled mb_type\n" );
+        return;
+    }
+
+    /* Coded block patern */
+    if( i_mb_type == I_4x4 )
+    {
+        bs_write_ue( s, intra4x4_cbp_to_golomb[( h->mb.i_cbp_chroma << 4 )|h->mb.i_cbp_luma] );
+    }
+    else if( i_mb_type != I_16x16 )
+    {
+        bs_write_ue( s, inter_cbp_to_golomb[( h->mb.i_cbp_chroma << 4 )|h->mb.i_cbp_luma] );
+    }
+
+    /* write residual */
+    if( i_mb_type == I_16x16 )
+    {
+        bs_write_se( s, h->mb.qp[h->mb.i_mb_xy] - h->mb.i_last_qp );
+
+        /* DC Luma */
+        block_residual_write_cavlc( h, s, BLOCK_INDEX_LUMA_DC , h->dct.luma16x16_dc, 16 );
+
+        if( h->mb.i_cbp_luma != 0 )
+        {
+            /* AC Luma */
+            for( i = 0; i < 16; i++ )
+            {
+                block_residual_write_cavlc( h, s, i, h->dct.block[i].residual_ac, 15 );
+            }
+        }
+    }
+    else if( h->mb.i_cbp_luma != 0 || h->mb.i_cbp_chroma != 0 )
+    {
+        bs_write_se( s, h->mb.qp[h->mb.i_mb_xy] - h->mb.i_last_qp );
+
+        for( i = 0; i < 16; i++ )
+        {
+            if( h->mb.i_cbp_luma & ( 1 << ( i / 4 ) ) )
+            {
+                block_residual_write_cavlc( h, s, i, h->dct.block[i].luma4x4, 16 );
+            }
+        }
+    }
+    if( h->mb.i_cbp_chroma != 0 )
+    {
+        /* Chroma DC residual present */
+        block_residual_write_cavlc( h, s, BLOCK_INDEX_CHROMA_DC, h->dct.chroma_dc[0], 4 );
+        block_residual_write_cavlc( h, s, BLOCK_INDEX_CHROMA_DC, h->dct.chroma_dc[1], 4 );
+        if( h->mb.i_cbp_chroma&0x02 ) /* Chroma AC residual present */
+        {
+            for( i = 0; i < 8; i++ )
+            {
+                block_residual_write_cavlc( h, s, 16 + i, h->dct.block[16+i].residual_ac, 15 );
+            }
+        }
+    }
+}
diff --git a/encoder/encoder.c b/encoder/encoder.c
new file mode 100644
index 00000000..73ca380b
--- /dev/null
+++ b/encoder/encoder.c
@@ -0,0 +1,1235 @@
+/*****************************************************************************
+ * x264: h264 encoder
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: encoder.c,v 1.1 2004/06/03 19:27:08 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdint.h>
+
+#include <math.h>
+
+#include "../core/common.h"
+#include "../core/cpu.h"
+
+#include "set.h"
+#include "analyse.h"
+#include "ratecontrol.h"
+#include "macroblock.h"
+
+//#define DEBUG_MB_TYPE
+//#define DEBUG_DUMP_FRAME 1
+
+static int64_t i_mtime_encode_frame = 0;
+/* Accumulators meant to be given as the d argument of TIMER_START/TIMER_STOP (e.g. i_mtime_filter) */
+static int64_t i_mtime_analyse = 0;
+static int64_t i_mtime_encode = 0;
+static int64_t i_mtime_write = 0;
+static int64_t i_mtime_filter = 0;
+#define TIMER_START( d ) \
+    { \
+        int64_t d##start = x264_mdate();
+/* TIMER_STOP closes the block opened by TIMER_START and adds the x264_mdate() delta to d: use them as a pair */
+#define TIMER_STOP( d ) \
+        d += x264_mdate() - d##start;\
+    }
+
+
+/****************************************************************************
+ *
+ ******************************* x264 libs **********************************
+ *
+ ****************************************************************************/
+
+
+/* PSNR (dB) between two 8 bit planes; identical planes are reported as -1.0 */
+static float x264_psnr( uint8_t *pix1, int i_pix_stride, uint8_t *pix2, int i_pix2_stride, int i_width, int i_height )
+{
+    int64_t i_ssd = 0;  /* sum of squared differences */
+    int i_row, i_col;
+
+    for( i_row = 0; i_row < i_height; i_row++ )
+    {
+        /* start of the current line in each plane */
+        const uint8_t *p_line1 = &pix1[i_row * i_pix_stride];
+        const uint8_t *p_line2 = &pix2[i_row * i_pix2_stride];
+
+        for( i_col = 0; i_col < i_width; i_col++ )
+        {
+            const int i_diff = p_line1[i_col] - p_line2[i_col];
+            i_ssd += i_diff * i_diff;
+        }
+    }
+
+    if( i_ssd == 0 )
+        return -1.0;
+    /* 65025 = 255 * 255 */
+    return (float)(10.0 * log( (double)65025.0 * (double)i_height * (double)i_width / (double)i_ssd ) / log( 10.0 ));
+}
+
+#if DEBUG_DUMP_FRAME
+/* Debug helper: append every plane of fr to the file name; does nothing if it cannot be opened */
+static void x264_frame_dump( x264_t *h, x264_frame_t *fr, char *name )
+{
+    FILE * f = fopen( name, "ab" ); /* binary append: raw samples, every write goes to the end */
+    int i, y;
+
+    if( f == NULL ) return;
+    for( i = 0; i < fr->i_plane; i++ )
+    {
+        for( y = 0; y < h->param.i_height / ( i == 0 ? 1 : 2 ); y++ )
+        {
+            fwrite( &fr->plane[i][y*fr->i_stride[i]], 1, h->param.i_width / ( i == 0 ? 1 : 2 ), f );
+        }
+    }
+    fclose( f );
+}
+#endif
+
+
+/* Fill "default" values.
+ * Every field of sh is set here; slice specific ones are fixed by the caller afterwards. */
+static void x264_slice_header_init( x264_slice_header_t *sh, x264_param_t *param,
+                                    x264_sps_t *sps, x264_pps_t *pps,
+                                    int i_type, int i_idr_pic_id, int i_frame )
+{
+    /* First we fill all field */
+    sh->sps = sps;
+    sh->pps = pps;
+
+    sh->i_type      = i_type;
+    sh->i_first_mb  = 0;
+    sh->i_pps_id    = pps->i_id;
+
+    sh->i_frame_num = i_frame;
+
+    sh->b_field_pic = 0;    /* No field support for now */
+    sh->b_bottom_field = 1; /* not yet used */
+
+    sh->i_idr_pic_id = i_idr_pic_id;
+
+    /* poc stuff, fixed later */
+    sh->i_poc_lsb = 0;
+    sh->i_delta_poc_bottom = 0;
+    sh->i_delta_poc[0] = 0;
+    sh->i_delta_poc[1] = 0;
+
+    sh->i_redundant_pic_cnt = 0;
+
+    sh->b_direct_spatial_mv_pred = 1;
+
+    sh->b_num_ref_idx_override = 0;
+    sh->i_num_ref_idx_l0_active = 1;
+    sh->i_num_ref_idx_l1_active = 1;
+
+    sh->i_cabac_init_idc = param->i_cabac_init_idc;
+
+    sh->i_qp_delta = 0;
+    sh->b_sp_for_swidth = 0;
+    sh->i_qs_delta = 0;
+
+    /* idc: 0 when the deblocking filter is enabled, 1 when it is disabled */
+    sh->i_disable_deblocking_filter_idc = param->b_deblocking_filter ? 0 : 1;
+
+    /* The header keeps twice the user offsets (they are written back halved).
+     * Multiply instead of shifting left: the parameters may be negative
+     * (x264_encoder_open clips them to [-6,6]) and left shifting a negative
+     * value is undefined behaviour in C. */
+    sh->i_alpha_c0_offset = param->i_deblocking_filter_alphac0 * 2;
+    sh->i_beta_offset = param->i_deblocking_filter_beta * 2;
+}
+
+/* Write the slice header held in sh to s; a non zero i_nal_ref_idc adds the reference marking flags */
+static void x264_slice_header_write( bs_t *s, x264_slice_header_t *sh, int i_nal_ref_idc )
+{
+    bs_write_ue( s, sh->i_first_mb );
+    bs_write_ue( s, sh->i_type + 5 );   /* same type things */
+    bs_write_ue( s, sh->i_pps_id );
+    bs_write( s, sh->sps->i_log2_max_frame_num, sh->i_frame_num );
+    /* i_idr_pic_id doubles as the IDR marker: x264_slice_init passes -1 for the other slices */
+    if( sh->i_idr_pic_id >= 0 ) /* NAL IDR */
+    {
+        bs_write_ue( s, sh->i_idr_pic_id );
+    }
+
+    if( sh->sps->i_poc_type == 0 )
+    {
+        bs_write( s, sh->sps->i_log2_max_poc_lsb, sh->i_poc_lsb );
+        if( sh->pps->b_pic_order && !sh->b_field_pic )
+        {
+            bs_write_se( s, sh->i_delta_poc_bottom );
+        }
+    }
+    else if( sh->sps->i_poc_type == 1 && !sh->sps->b_delta_pic_order_always_zero )
+    {
+        bs_write_se( s, sh->i_delta_poc[0] );
+        if( sh->pps->b_pic_order && !sh->b_field_pic )
+        {
+            bs_write_se( s, sh->i_delta_poc[1] );
+        }
+    }
+
+    if( sh->pps->b_redundant_pic_cnt )
+    {
+        bs_write_ue( s, sh->i_redundant_pic_cnt );
+    }
+
+    if( sh->i_type == SLICE_TYPE_B )
+    {
+        bs_write1( s, sh->b_direct_spatial_mv_pred );
+    }
+    if( sh->i_type == SLICE_TYPE_P || sh->i_type == SLICE_TYPE_SP || sh->i_type == SLICE_TYPE_B )
+    {
+        bs_write1( s, sh->b_num_ref_idx_override );
+        if( sh->b_num_ref_idx_override )
+        {
+            bs_write_ue( s, sh->i_num_ref_idx_l0_active - 1 );
+            if( sh->i_type == SLICE_TYPE_B )
+            {
+                bs_write_ue( s, sh->i_num_ref_idx_l1_active - 1 );
+            }
+        }
+    }
+    /* ref pic list reordering */
+    if( sh->i_type != SLICE_TYPE_I )
+    {
+        int b_ref_pic_list_reordering_l0 = 0;
+        bs_write1( s, b_ref_pic_list_reordering_l0 );   /* always 0: list 0 is never reordered */
+        if( b_ref_pic_list_reordering_l0 )
+        {
+            /* FIXME */
+        }
+    }
+    if( sh->i_type == SLICE_TYPE_B )
+    {
+        int b_ref_pic_list_reordering_l1 = 0;
+        bs_write1( s, b_ref_pic_list_reordering_l1 );   /* always 0: list 1 is never reordered */
+        if( b_ref_pic_list_reordering_l1 )
+        {
+            /* FIXME */
+        }
+    }
+    /* prediction weights: nothing is written for now, whatever the pps flags say */
+    if( ( sh->pps->b_weighted_pred && ( sh->i_type == SLICE_TYPE_P || sh->i_type == SLICE_TYPE_SP ) ) ||
+        ( sh->pps->b_weighted_bipred == 1 && sh->i_type == SLICE_TYPE_B ) )
+    {
+        /* FIXME */
+    }
+
+    if( i_nal_ref_idc != 0 )
+    {
+        if( sh->i_idr_pic_id >= 0 )
+        {
+            bs_write1( s, 0 );  /* no output of prior pics flag */
+            bs_write1( s, 0 );  /* long term reference flag */
+        }
+        else
+        {
+            bs_write1( s, 0 );  /* adaptive_ref_pic_marking_mode_flag */
+            /* FIXME */
+        }
+    }
+
+    if( sh->pps->b_cabac && sh->i_type != SLICE_TYPE_I )
+    {
+        bs_write_ue( s, sh->i_cabac_init_idc );
+    }
+    bs_write_se( s, sh->i_qp_delta );      /* slice qp delta */
+#if 0 /* SP/SI only fields, compiled out */
+    if( sh->i_type == SLICE_TYPE_SP || sh->i_type == SLICE_TYPE_SI )
+    {
+        if( sh->i_type == SLICE_TYPE_SP )
+        {
+            bs_write1( s, sh->b_sp_for_swidth );
+        }
+        bs_write_se( s, sh->i_qs_delta );
+    }
+#endif
+
+    if( sh->pps->b_deblocking_filter_control )
+    {
+        bs_write_ue( s, sh->i_disable_deblocking_filter_idc );
+        if( sh->i_disable_deblocking_filter_idc != 1 )
+        {
+            bs_write_se( s, sh->i_alpha_c0_offset >> 1 );   /* stored doubled by x264_slice_header_init */
+            bs_write_se( s, sh->i_beta_offset >> 1 );
+        }
+    }
+}
+
+/****************************************************************************
+ *
+ ****************************************************************************
+ ****************************** External API*********************************
+ ****************************************************************************
+ *
+ ****************************************************************************/
+
+/****************************************************************************
+ * x264_encoder_open: returns NULL (message on stderr) if param is rejected
+ ****************************************************************************/
+x264_t *x264_encoder_open   ( x264_param_t *param )
+{
+    x264_t *h;
+    int i;
+
+    /* Check parameters validity */
+    if( param->i_width <= 0  || param->i_height <= 0 )
+    {
+        fprintf( stderr, "invalid width x height (%dx%d)\n",
+                 param->i_width, param->i_height );
+        return NULL;
+    }
+
+    if( param->i_width % 16 != 0 || param->i_height % 16 != 0 )
+    {
+        fprintf( stderr, "width %% 16 != 0 or height %% 16 != 0 (%dx%d)\n",
+                 param->i_width, param->i_height );
+        return NULL;
+    }
+    if( param->i_csp != X264_CSP_I420 )
+    {
+        fprintf( stderr, "invalid CSP (only I420 supported)\n" );
+        return NULL;
+    }
+
+    /* Allocate only once param is accepted: the error paths above own nothing to release */
+    h = x264_malloc( sizeof( x264_t ) );
+
+    /* Fix parameters values */
+    memcpy( &h->param, param, sizeof( x264_param_t ) );
+    h->param.i_frame_reference = x264_clip3( h->param.i_frame_reference, 1, 15 );
+    if( h->param.i_idrframe <= 0 )
+    {
+        h->param.i_idrframe = 1;
+    }
+    if( h->param.i_iframe <= 0 )
+    {
+        h->param.i_iframe = 1;
+    }
+    h->param.i_bframe  = x264_clip3( h->param.i_bframe , 0, X264_BFRAME_MAX );
+#if 0
+    if( h->param.i_bframe > 0 && h->param.b_cabac )
+    {
+        fprintf( stderr, "cabac not supported with B frame (cabac disabled)\n" );
+        h->param.b_cabac = 0;
+    }
+#endif
+
+    h->param.i_deblocking_filter_alphac0 = x264_clip3( h->param.i_deblocking_filter_alphac0, -6, 6 );
+    h->param.i_deblocking_filter_beta    = x264_clip3( h->param.i_deblocking_filter_beta, -6, 6 );
+
+    h->param.i_cabac_init_idc = x264_clip3( h->param.i_cabac_init_idc, -1, 2 );
+
+    /* Init x264_t */
+    h->out.i_nal = 0;
+    h->out.i_bitstream = 1000000; /* FIXME estimate max size (width/height) */
+    h->out.p_bitstream = x264_malloc( h->out.i_bitstream );
+
+    h->i_frame = 0;
+    h->i_frame_num = 0;
+    h->i_poc   = 0;
+    h->i_idr_pic_id = 0;
+
+    h->sps = &h->sps_array[0];
+    x264_sps_init( h->sps, 0, &h->param );
+
+    h->pps = &h->pps_array[0];
+    x264_pps_init( h->pps, 0, &h->param, h->sps);
+
+    /* Init frames. */
+    for( i = 0; i < X264_BFRAME_MAX + 1; i++ )
+    {
+        h->frames.current[i] = NULL;
+        h->frames.next[i]    = NULL;
+        h->frames.unused[i]  = NULL;
+    }
+    for( i = 0; i < 1 + h->param.i_bframe; i++ )
+    {
+        h->frames.unused[i] =  x264_frame_new( h );
+    }
+    for( i = 0; i < 2 + h->param.i_frame_reference; i++ )
+    {
+        /* 2 = 1 backward ref  + 1 fdec */
+        h->frames.reference[i] = x264_frame_new( h );
+    }
+    h->frames.i_last_idr = h->param.i_idrframe;
+    h->frames.i_last_i   = h->param.i_iframe;
+
+    h->i_ref0 = 0;
+    h->i_ref1 = 0;
+
+    h->fdec = h->frames.reference[0];
+
+    /* init mb cache */
+    x264_macroblock_cache_init( h );
+
+    /* init cabac adaptive model */
+    x264_cabac_model_init( &h->cabac );
+
+    /* init CPU functions */
+    x264_predict_16x16_init( h->param.cpu, h->predict_16x16 );
+    x264_predict_8x8_init( h->param.cpu, h->predict_8x8 );
+    x264_predict_4x4_init( h->param.cpu, h->predict_4x4 );
+
+    x264_pixel_init( h->param.cpu, &h->pixf );
+    x264_dct_init( h->param.cpu, &h->dctf );
+    x264_mc_init( h->param.cpu, h->mc );
+    x264_csp_init( h->param.cpu, h->param.i_csp, &h->csp );
+
+    /* rate control */
+    h->rc = x264_ratecontrol_new( &h->param );
+
+    /* stat */
+    h->stat.i_slice_count[SLICE_TYPE_I] = 0;
+    h->stat.i_slice_count[SLICE_TYPE_P] = 0;
+    h->stat.i_slice_count[SLICE_TYPE_B] = 0;
+    h->stat.i_slice_size[SLICE_TYPE_I] = 0;
+    h->stat.i_slice_size[SLICE_TYPE_P] = 0;
+    h->stat.i_slice_size[SLICE_TYPE_B] = 0;
+
+    h->stat.f_psnr_y[SLICE_TYPE_I] = 0.0; h->stat.f_psnr_u[SLICE_TYPE_I] = 0.0; h->stat.f_psnr_v[SLICE_TYPE_I] = 0.0;
+    h->stat.f_psnr_y[SLICE_TYPE_P] = 0.0; h->stat.f_psnr_u[SLICE_TYPE_P] = 0.0; h->stat.f_psnr_v[SLICE_TYPE_P] = 0.0;
+    h->stat.f_psnr_y[SLICE_TYPE_B] = 0.0; h->stat.f_psnr_u[SLICE_TYPE_B] = 0.0; h->stat.f_psnr_v[SLICE_TYPE_B] = 0.0;
+
+    for( i = 0; i < 17; i++ )
+    {
+        h->stat.i_mb_count[SLICE_TYPE_I][i] = 0;
+        h->stat.i_mb_count[SLICE_TYPE_P][i] = 0;
+        h->stat.i_mb_count[SLICE_TYPE_B][i] = 0;
+    }
+    return h;
+}
+
+/* internal usage: open the NAL h->out.nal[h->out.i_nal] at the current write position of h->out.bs */
+static void x264_nal_start( x264_t *h, int i_type, int i_ref_idc )
+{
+    x264_nal_t *p_cur = h->out.nal + h->out.i_nal;
+
+    /* align the writer before taking the payload address (not needed) */
+    bs_align_0( &h->out.bs );
+
+    p_cur->i_type    = i_type;
+    p_cur->i_ref_idc = i_ref_idc;
+    p_cur->i_payload = 0;
+    p_cur->p_payload = h->out.p_bitstream + bs_pos( &h->out.bs ) / 8;
+}
+/* Close the NAL opened by x264_nal_start: its payload size is what was written since then */
+static void x264_nal_end( x264_t *h )
+{
+    x264_nal_t *p_cur = h->out.nal + h->out.i_nal;
+
+    bs_align_0( &h->out.bs );   /* not needed */
+    p_cur->i_payload = ( h->out.p_bitstream + bs_pos( &h->out.bs ) / 8 ) - p_cur->p_payload;
+
+    h->out.i_nal += 1;
+}
+
+/****************************************************************************
+ * x264_encoder_headers: output the SPS and PPS NALs while h->i_frame is 0
+ ****************************************************************************/
+int x264_encoder_headers( x264_t *h, x264_nal_t **pp_nal, int *pi_nal )
+{
+    bs_t *p_bs = &h->out.bs;
+
+    /* restart the output: no NAL yet, writer at the beginning of the buffer */
+    h->out.i_nal = 0;
+    bs_init( p_bs, h->out.p_bitstream, h->out.i_bitstream );
+
+    /* sequence parameters first, picture parameters next */
+    if( h->i_frame == 0 )
+    {
+        x264_nal_start( h, NAL_SPS, NAL_PRIORITY_HIGHEST );
+        x264_sps_write( p_bs, h->sps );
+        x264_nal_end( h );
+
+        x264_nal_start( h, NAL_PPS, NAL_PRIORITY_HIGHEST );
+        x264_pps_write( p_bs, h->pps );
+        x264_nal_end( h );
+    }
+
+    /* hand the NAL array (possibly empty) back to the caller */
+    *pp_nal = &h->out.nal[0];
+    *pi_nal = h->out.i_nal;
+    return 0;
+}
+
+
+/* Store frame in the first free (NULL) slot of list, which has X264_BFRAME_MAX+1 entries */
+static void x264_frame_put( x264_frame_t *list[X264_BFRAME_MAX+1], x264_frame_t *frame )
+{
+    int i = 0;
+    while( i <= X264_BFRAME_MAX && list[i] != NULL ) i++;   /* bounded: never scan past the last slot */
+    if( i <= X264_BFRAME_MAX ) list[i] = frame;
+    else fprintf( stderr, "x264_frame_put: list full, frame dropped\n" );
+}
+
+/* Pop the head of list: the other entries move down by one slot and the last slot becomes NULL */
+static x264_frame_t *x264_frame_get( x264_frame_t *list[X264_BFRAME_MAX+1] )
+{
+    x264_frame_t *p_head = list[0];
+    int i_slot;
+
+    for( i_slot = 1; i_slot <= X264_BFRAME_MAX; i_slot++ )
+    {
+        list[i_slot-1] = list[i_slot];
+    }
+    list[X264_BFRAME_MAX] = NULL;
+    return p_head;
+}
+
+/* Build h->fref0 / h->fref1 (and h->i_ref0 / h->i_ref1) for a picture whose poc is i_poc */
+static inline void x264_reference_build_list( x264_t *h, int i_poc )
+{
+    const int i_ref_end = h->param.i_frame_reference + 2;
+    int i_idx;
+    int b_sorted;
+
+    /* Dispatch the valid references (poc >= 0): lower poc in list 0, higher poc in list 1 */
+    h->i_ref0 = 0;
+    h->i_ref1 = 0;
+    for( i_idx = 1; i_idx < i_ref_end; i_idx++ )
+    {
+        x264_frame_t *p_ref = h->frames.reference[i_idx];
+
+        if( p_ref->i_poc < 0 ) continue;
+        if( p_ref->i_poc < i_poc )
+        {
+            h->fref0[h->i_ref0++] = p_ref;
+        }
+        else if( p_ref->i_poc > i_poc )
+        {
+            h->fref1[h->i_ref1++] = p_ref;
+        }
+    }
+
+    /* Order ref0 from higher to lower poc: swap neighbours until a pass changes nothing */
+    for( b_sorted = 0; !b_sorted; )
+    {
+        b_sorted = 1;
+        for( i_idx = 0; i_idx + 1 < h->i_ref0; i_idx++ )
+        {
+            if( h->fref0[i_idx]->i_poc < h->fref0[i_idx+1]->i_poc )
+            {
+                x264_frame_t *p_swap = h->fref0[i_idx];
+                h->fref0[i_idx]   = h->fref0[i_idx+1];
+                h->fref0[i_idx+1] = p_swap;
+                b_sorted = 0;
+            }
+        }
+    }
+    /* Order ref1 from lower to higher poc, same method (for B-frame) */
+    for( b_sorted = 0; !b_sorted; )
+    {
+        b_sorted = 1;
+        for( i_idx = 0; i_idx + 1 < h->i_ref1; i_idx++ )
+        {
+            if( h->fref1[i_idx]->i_poc > h->fref1[i_idx+1]->i_poc )
+            {
+                x264_frame_t *p_swap = h->fref1[i_idx];
+                h->fref1[i_idx]   = h->fref1[i_idx+1];
+                h->fref1[i_idx+1] = p_swap;
+                b_sorted = 0;
+            }
+        }
+    }
+
+    /* Keep at most i_frame_reference entries in list 0 and a single one in list 1 */
+    if( h->i_ref0 > h->param.i_frame_reference )
+    {
+        h->i_ref0 = h->param.i_frame_reference;
+    }
+    if( h->i_ref1 > 1 )
+    {
+        h->i_ref1 = 1;
+    }
+}
+
+/* Finish the decoded picture (deblocking, borders), then rotate the reference buffer by one slot */
+static inline void x264_reference_update( x264_t *h )
+{
+    const int i_last = h->param.i_frame_reference + 1;
+    x264_frame_t *p_recycled;
+    int i_slot;
+
+    if( h->param.b_deblocking_filter )
+    {
+        /* deblocking filter on the current decoded picture, timed in i_mtime_filter */
+        TIMER_START( i_mtime_filter );
+        x264_frame_deblocking_filter( h, h->sh.i_type );
+        TIMER_STOP( i_mtime_filter );
+    }
+    x264_frame_expand_border( h->fdec );
+    /* the frame of the last slot becomes the new fdec in slot 0, every other frame moves up by one */
+    p_recycled = h->frames.reference[i_last];
+    for( i_slot = i_last; i_slot > 0; i_slot-- )
+        h->frames.reference[i_slot] = h->frames.reference[i_slot-1];
+    h->frames.reference[0] = p_recycled;
+    h->fdec = p_recycled;
+}
+
+/* Invalidate every reference picture (poc = -1) except slot 0, whose poc restarts at 0 */
+static inline void x264_reference_reset( x264_t *h )
+{
+    int i_slot = h->param.i_frame_reference + 1;
+
+    /* walk down from the last slot; slot 0 is handled apart */
+    for( ; i_slot >= 1; i_slot-- )
+        h->frames.reference[i_slot]->i_poc = -1;
+
+    h->frames.reference[0]->i_poc = 0;
+}
+
+/* Prepare h->sh for the slice to come: default header, number of references,
+ * poc, qp delta and (when the cabac idc is adaptive) the cabac model */
+static inline void x264_slice_init( x264_t *h, int i_nal_type, int i_slice_type, int i_global_qp )
+{
+    const int b_idr = ( i_nal_type == NAL_SLICE_IDR );
+
+    /* ------------------------ Create slice header  ----------------------- */
+    x264_slice_header_init( &h->sh, &h->param, h->sps, h->pps, i_slice_type,
+                            b_idr ? h->i_idr_pic_id : -1, h->i_frame_num - 1 );
+    if( b_idr )
+    {
+        /* increment id */
+        h->i_idr_pic_id = ( h->i_idr_pic_id + 1 ) % 65535;
+    }
+    else
+    {
+        /* always set the real higher num of ref frame used (at least 1 per list) */
+        h->sh.b_num_ref_idx_override = 1;
+        h->sh.i_num_ref_idx_l0_active = h->i_ref0 > 0 ? h->i_ref0 : 1;
+        h->sh.i_num_ref_idx_l1_active = h->i_ref1 > 0 ? h->i_ref1 : 1;
+    }
+
+    /* picture order count, only handled for poc type 0 */
+    switch( h->sps->i_poc_type )
+    {
+        case 0:
+            h->sh.i_poc_lsb = h->fdec->i_poc & ( (1 << h->sps->i_log2_max_poc_lsb) - 1 );
+            h->sh.i_delta_poc_bottom = 0;   /* XXX won't work for field */
+            break;
+        case 1:
+            /* FIXME TODO FIXME */
+            break;
+        default:
+            /* Nothing to do ? */
+            break;
+    }
+
+    /* global qp, stored relatively to the pps initial qp */
+    h->sh.i_qp_delta = i_global_qp - h->pps->i_pic_init_qp;
+
+    /* get adaptive cabac model if needed */
+    if( h->param.b_cabac && h->param.i_cabac_init_idc == -1 )
+    {
+        h->sh.i_cabac_init_idc = x264_cabac_model_get( &h->cabac, i_slice_type );
+    }
+}
+/* Code every macroblock of the picture as one slice NAL; i_mb_count[] returns the number of MBs of each type. */
+static inline void x264_slice_write( x264_t *h, int i_nal_type, int i_nal_ref_idc, int i_mb_count[18] )
+{
+    int i_skip;     /* CAVLC only: length of the current run of skipped macroblocks */
+    int mb_xy;
+    int i;
+
+    /* Init stats: clear all 18 counters, the array is indexed by h->mb.i_type below */
+    for( i = 0; i < 18; i++ ) i_mb_count[i] = 0;
+
+    /* Slice */
+    x264_nal_start( h, i_nal_type, i_nal_ref_idc );
+
+    /* Slice header */
+    x264_slice_header_write( &h->out.bs, &h->sh, i_nal_ref_idc );
+    if( h->param.b_cabac )
+    {
+        /* alignment needed */
+        bs_align_1( &h->out.bs );
+
+        /* init cabac */
+        x264_cabac_context_init( &h->cabac, h->sh.i_type, h->sh.pps->i_pic_init_qp + h->sh.i_qp_delta, h->sh.i_cabac_init_idc );
+        x264_cabac_encode_init ( &h->cabac, &h->out.bs );
+    }
+    h->mb.i_last_qp = h->pps->i_pic_init_qp + h->sh.i_qp_delta;
+    h->mb.i_last_dqp = 0;
+
+    for( mb_xy = 0, i_skip = 0; mb_xy < h->sps->i_mb_width * h->sps->i_mb_height; mb_xy++ )
+    {
+        const int i_mb_y = mb_xy / h->sps->i_mb_width;
+        const int i_mb_x = mb_xy % h->sps->i_mb_width;
+
+        /* load cache */
+        x264_macroblock_cache_load( h, i_mb_x, i_mb_y );
+
+        /* analyse parameters
+         * Slice I: choose I_4x4 or I_16x16 mode
+         * Slice P: choose between using P mode or intra (4x4 or 16x16)
+         * */
+        TIMER_START( i_mtime_analyse );
+        x264_macroblock_analyse( h );
+        TIMER_STOP( i_mtime_analyse );
+
+        /* encode this macroblock -> be careful it can change the mb type to P_SKIP if needed */
+        TIMER_START( i_mtime_encode );
+        x264_macroblock_encode( h );
+        TIMER_STOP( i_mtime_encode );
+
+        TIMER_START( i_mtime_write );
+        if( IS_SKIP( h->mb.i_type ) )
+        {
+            if( h->param.b_cabac )
+            {
+                if( mb_xy > 0 )
+                {
+                    /* not end_of_slice_flag */
+                    x264_cabac_encode_terminal( &h->cabac, 0 );
+                }
+
+                x264_cabac_mb_skip( h, 1 );
+            }
+            else
+            {
+                i_skip++;   /* CAVLC: skipped MBs are only counted, the run is written before the next coded MB */
+            }
+        }
+        else
+        {
+            if( h->param.b_cabac )
+            {
+                if( mb_xy > 0 )
+                {
+                    /* not end_of_slice_flag */
+                    x264_cabac_encode_terminal( &h->cabac, 0 );
+                }
+                if( h->sh.i_type != SLICE_TYPE_I )
+                {
+                    x264_cabac_mb_skip( h, 0 );
+                }
+                x264_macroblock_write_cabac( h, &h->out.bs );
+            }
+            else
+            {
+                if( h->sh.i_type != SLICE_TYPE_I )
+                {
+                    bs_write_ue( &h->out.bs, i_skip );  /* skip run */
+                    i_skip = 0;
+                }
+                x264_macroblock_write_cavlc( h, &h->out.bs );
+            }
+        }
+        TIMER_STOP( i_mtime_write );
+
+        /* save cache */
+        x264_macroblock_cache_save( h );
+
+        i_mb_count[h->mb.i_type]++;
+    }
+
+    if( h->param.b_cabac )
+    {
+        /* end of slice */
+        x264_cabac_encode_terminal( &h->cabac, 1 );
+    }
+    else if( i_skip > 0 )
+    {
+        bs_write_ue( &h->out.bs, i_skip );  /* last skip run */
+    }
+
+    if( h->param.b_cabac )
+    {
+        int i_cabac_word;
+        x264_cabac_encode_flush( &h->cabac );
+        /* TODO cabac stuffing things (p209) */
+        i_cabac_word = (((3 * h->cabac.i_sym_cnt - 3 * 96 * h->sps->i_mb_width * h->sps->i_mb_height)/32) - bs_pos( &h->out.bs)/8)/3;
+
+        while( i_cabac_word > 0 )   /* pad with 16-bit zero words while the computed stuffing count is positive */
+        {
+            bs_write( &h->out.bs, 16, 0x0000 );
+            i_cabac_word--;
+        }
+    }
+    else
+    {
+        /* rbsp_slice_trailing_bits */
+        bs_rbsp_trailing( &h->out.bs );
+    }
+
+    x264_nal_end( h );
+}
+
+/****************************************************************************
+ * x264_encoder_encode: queue pic (may be NULL to flush the delayed frames), then code at most one frame.
+ *  Always returns 0; *pp_nal / *pi_nal describe the NAL units written (*pi_nal == 0: nothing coded yet).
+ *  XXX: i_poc is the poc of the current given picture, i_frame is the number of the frame being coded
+ *  ex:  type frame poc
+ *       I      0   2*0
+ *       P      1   2*3
+ *       B      2   2*1
+ *       B      3   2*2
+ *       P      4   2*6
+ *       B      5   2*4
+ *       B      6   2*5
+ ****************************************************************************/
+int     x264_encoder_encode( x264_t *h,
+                             x264_nal_t **pp_nal, int *pi_nal,
+                             x264_picture_t *pic )
+{
+    x264_frame_t   *frame_psnr = h->fdec; /* just to keep the current decoded frame for psnr calculation */
+    int     i_nal_type;
+    int     i_nal_ref_idc;
+    int     i_slice_type;
+
+    int i;
+
+    float psnr_y, psnr_u, psnr_v;
+
+    int   i_global_qp;
+
+    int i_mb_count[18];
+
+    /* no data out */
+    *pi_nal = 0;
+    *pp_nal = NULL;
+
+
+    /* ------------------- Setup new frame from picture -------------------- */
+    TIMER_START( i_mtime_encode_frame );
+    if( pic != NULL )
+    {
+        /* Copy the picture to a frame, init the frame and move it to a buffer */
+        /* 1: get a frame */
+        x264_frame_t *fenc = x264_frame_get( h->frames.unused );
+
+        x264_frame_copy_picture( h, fenc, pic );
+
+        /* 2: get its type */
+        if( ( h->frames.i_last_i + 1 >= h->param.i_iframe && h->frames.i_last_idr + 1 >= h->param.i_idrframe ) ||
+            pic->i_type == X264_TYPE_IDR )
+        {
+            /* IDR */
+            fenc->i_type = X264_TYPE_IDR;
+
+            h->i_poc       = 0;
+            h->i_frame_num = 0;
+
+            /* Last schedule B frames need to be encoded as P */
+            if( h->frames.next[0] != NULL )
+            {
+                x264_frame_t *tmp;
+                int i = 0;
+
+                while( h->frames.next[i+1] != NULL ) i++;
+                h->frames.next[i]->i_type = X264_TYPE_P;
+
+                /* remove this P from next */
+                tmp = h->frames.next[i];
+                h->frames.next[i] = NULL;
+
+                /* move this P + Bs to current */
+                x264_frame_put( h->frames.current, tmp );
+                while( ( tmp = x264_frame_get( h->frames.next ) ) )
+                {
+                    x264_frame_put( h->frames.current, tmp );
+                }
+            }
+        }
+        else if( h->param.i_bframe > 0 )
+        {
+            if( h->frames.i_last_i  + 1 >= h->param.i_iframe )
+                fenc->i_type = X264_TYPE_I;
+            else if( h->frames.next[h->param.i_bframe-1] != NULL )  /* i_bframe frames already wait in next[] */
+                fenc->i_type = X264_TYPE_P;
+            else if( pic->i_type == X264_TYPE_AUTO )
+                fenc->i_type = X264_TYPE_B;
+            else
+                fenc->i_type = pic->i_type;
+        }
+        else
+        {
+            if( pic->i_type == X264_TYPE_AUTO )
+            {
+                if( h->frames.i_last_i + 1 >= h->param.i_iframe )
+                    fenc->i_type = X264_TYPE_I;
+                else
+                    fenc->i_type = X264_TYPE_P;
+            }
+            else
+            {
+                fenc->i_type = pic->i_type;
+            }
+        }
+
+        fenc->i_poc = h->i_poc;
+        if( fenc->i_type == X264_TYPE_IDR )
+        {
+            h->frames.i_last_idr = 0;
+            h->frames.i_last_i = 0;
+        }
+        else if( fenc->i_type == X264_TYPE_I )
+        {
+            h->frames.i_last_idr++;
+            h->frames.i_last_i = 0;
+        }
+        else
+        {
+            h->frames.i_last_i++;
+        }
+
+        /* 3: Update current/next */
+        if( fenc->i_type == X264_TYPE_B )
+        {
+            x264_frame_put( h->frames.next, fenc );
+        }
+        else
+        {
+            x264_frame_put( h->frames.current, fenc );
+            while( ( fenc = x264_frame_get( h->frames.next ) ) )
+            {
+                x264_frame_put( h->frames.current, fenc );
+            }
+        }
+        h->i_poc += 2;
+    }
+    else    /* No more picture, begin encoding of last frames */
+    {
+        /* Move all next frames to current and mark the last one as a P */
+        x264_frame_t *tmp;
+        int i = -1;
+        while( h->frames.next[i+1] != NULL ) i++;
+        if( i >= 0 )
+        {
+            h->frames.next[i]->i_type = X264_TYPE_P;
+            tmp = h->frames.next[i];
+            h->frames.next[i] = NULL;
+
+            x264_frame_put( h->frames.current, tmp );
+            while( ( tmp = x264_frame_get( h->frames.next ) ) )
+            {
+                x264_frame_put( h->frames.current, tmp );
+            }
+        }
+    }
+    TIMER_STOP( i_mtime_encode_frame );
+
+    /* ------------------- Get frame to be encoded ------------------------- */
+    /* 4: get picture to encode */
+    h->fenc = x264_frame_get( h->frames.current );
+    if( h->fenc == NULL )
+    {
+        /* Nothing yet to encode (ex: waiting for I/P with B frames) */
+        /* waiting for filling bframe buffer (pic is NULL when flushing, so the write must be guarded) */
+        if( pic != NULL ) pic->i_type = X264_TYPE_AUTO;
+        return 0;
+    }
+    x264_frame_put( h->frames.unused, h->fenc );  /* Safe to do it now, we don't use frames.unused for the rest */
+
+    /* ------------------- Setup frame context ----------------------------- */
+    /* 5: Init data dependant of frame type */
+    TIMER_START( i_mtime_encode_frame );
+    if( h->fenc->i_type == X264_TYPE_IDR )
+    {
+        /* reset ref pictures */
+        x264_reference_reset( h );
+
+        i_nal_type    = NAL_SLICE_IDR;
+        i_nal_ref_idc = NAL_PRIORITY_HIGHEST;
+        i_slice_type = SLICE_TYPE_I;
+    }
+    else if( h->fenc->i_type == X264_TYPE_I )
+    {
+        i_nal_type    = NAL_SLICE;
+        i_nal_ref_idc = NAL_PRIORITY_HIGH; /* Not completely true but for now it is (as all I/P are kept as ref)*/
+        i_slice_type = SLICE_TYPE_I;
+    }
+    else if( h->fenc->i_type == X264_TYPE_P )
+    {
+        i_nal_type    = NAL_SLICE;
+        i_nal_ref_idc = NAL_PRIORITY_HIGH; /* Not completely true but for now it is (as all I/P are kept as ref)*/
+        i_slice_type = SLICE_TYPE_P;
+    }
+    else    /* B frame */
+    {
+        i_nal_type    = NAL_SLICE;
+        i_nal_ref_idc = NAL_PRIORITY_DISPOSABLE;
+        i_slice_type = SLICE_TYPE_B;
+    }
+
+    h->fdec->i_type = h->fenc->i_type;
+    if( pic != NULL ) pic->i_type = h->fenc->i_type;   /* pic is NULL when the caller is flushing */
+    h->fdec->i_poc  = h->fenc->i_poc;
+
+
+
+    /* ------------------- Init                ----------------------------- */
+    /* Init the rate control */
+    x264_ratecontrol_start( h->rc, i_slice_type );
+    i_global_qp = x264_ratecontrol_qp( h->rc );
+    if( h->fenc->i_qpplus1 > 0 )    /* a per-frame qp (stored +1) overrides the rate control value */
+    {
+        i_global_qp = x264_clip3( h->fenc->i_qpplus1 - 1, 0, 51 );
+    }
+
+    /* build ref list 0/1 */
+    x264_reference_build_list( h, h->fdec->i_poc );
+
+    /* increase frame num but only once for B frame (h->sh still describes the previously coded slice here) */
+    if( i_slice_type != SLICE_TYPE_B || h->sh.i_type != SLICE_TYPE_B )
+    {
+        h->i_frame_num++;
+    }
+
+    /* ------------------------ Create slice header  ----------------------- */
+    x264_slice_init( h, i_nal_type, i_slice_type, i_global_qp );
+
+    /* ---------------------- Write the bitstream -------------------------- */
+    /* Init bitstream context */
+    h->out.i_nal = 0;
+    bs_init( &h->out.bs, h->out.p_bitstream, h->out.i_bitstream );
+
+    /* Write SPS and PPS */
+    if( i_nal_type == NAL_SLICE_IDR )
+    {
+        /* generate sequence parameters */
+        x264_nal_start( h, NAL_SPS, NAL_PRIORITY_HIGHEST );
+        x264_sps_write( &h->out.bs, h->sps );
+        x264_nal_end( h );
+
+        /* generate picture parameters */
+        x264_nal_start( h, NAL_PPS, NAL_PRIORITY_HIGHEST );
+        x264_pps_write( &h->out.bs, h->pps );
+        x264_nal_end( h );
+    }
+
+    /* Write the slice */
+    x264_slice_write( h, i_nal_type, i_nal_ref_idc, i_mb_count );
+
+    /* End bitstream, set output  */
+    *pi_nal = h->out.i_nal;
+    *pp_nal = &h->out.nal[0];
+
+    /* Set output picture properties (skipped when flushing: pic is NULL then) */
+    if( pic != NULL )
+    {
+        pic->i_type = i_slice_type == SLICE_TYPE_P ? X264_TYPE_P :
+                      i_slice_type == SLICE_TYPE_B ? X264_TYPE_B :
+                      i_nal_type == NAL_SLICE_IDR  ? X264_TYPE_IDR : X264_TYPE_I;
+        pic->i_pts = h->fenc->i_pts;
+    }
+
+    /* ---------------------- Update encoder state ------------------------- */
+    /* update cabac */
+    if( h->param.b_cabac )
+    {
+        x264_cabac_model_update( &h->cabac, i_slice_type, h->sh.pps->i_pic_init_qp + h->sh.i_qp_delta );
+    }
+
+    /* handle references (disposable pictures, i.e. B frames here, are not inserted in the reference buffer) */
+    if( i_nal_ref_idc != NAL_PRIORITY_DISPOSABLE )
+    {
+        x264_reference_update( h );
+    }
+
+    /* increase frame count */
+    h->i_frame++;
+
+    /* update rc: report the size in bits of the last NAL written (the slice) */
+    x264_ratecontrol_end( h->rc, h->out.nal[h->out.i_nal-1].i_payload * 8 );
+
+    /* restore CPU state (before using float again) */
+    x264_cpu_restore( h->param.cpu );
+
+    TIMER_STOP( i_mtime_encode_frame );
+
+    /* ---------------------- Compute/Print statistics --------------------- */
+    /* PSNR: frame_psnr is the frame that was h->fdec on entry, h->fdec itself may have been rotated above */
+    psnr_y = x264_psnr( frame_psnr->plane[0], frame_psnr->i_stride[0], h->fenc->plane[0], h->fenc->i_stride[0], h->param.i_width, h->param.i_height );
+    psnr_u = x264_psnr( frame_psnr->plane[1], frame_psnr->i_stride[1], h->fenc->plane[1], h->fenc->i_stride[1], h->param.i_width/2, h->param.i_height/2);
+    psnr_v = x264_psnr( frame_psnr->plane[2], frame_psnr->i_stride[2], h->fenc->plane[2], h->fenc->i_stride[2], h->param.i_width/2, h->param.i_height/2);
+
+    /* Slice stat */
+    h->stat.i_slice_count[i_slice_type]++;
+    h->stat.i_slice_size[i_slice_type] += bs_pos( &h->out.bs) / 8;
+    h->stat.f_psnr_y[i_slice_type] += psnr_y;
+    h->stat.f_psnr_u[i_slice_type] += psnr_u;
+    h->stat.f_psnr_v[i_slice_type] += psnr_v;
+
+    for( i = 0; i < 17; i++ )
+    {
+        h->stat.i_mb_count[h->sh.i_type][i] += i_mb_count[i];
+    }
+
+    /* print stat */
+    fprintf( stderr, "frame=%4d NAL=%d Slice:%c Poc:%-3d I4x4:%-5d I16x16:%-5d P:%-5d SKIP:%-3d size=%d bytes PSNR Y:%2.2f U:%2.2f V:%2.2f\n",
+             h->i_frame - 1,
+             i_nal_ref_idc,
+             i_slice_type == SLICE_TYPE_I ? 'I' : (i_slice_type == SLICE_TYPE_P ? 'P' : 'B' ),
+             frame_psnr->i_poc,
+             i_mb_count[I_4x4],
+             i_mb_count[I_16x16],
+             i_mb_count[P_L0] + i_mb_count[P_8x8],
+             i_mb_count[P_SKIP],
+             h->out.nal[h->out.i_nal-1].i_payload,
+             psnr_y, psnr_u, psnr_v );
+#ifdef DEBUG_MB_TYPE
+    for( i = 0; i < h->sps->i_mb_width * h->sps->i_mb_height; i++ )   /* i is free again here; no mb_xy exists in this function */
+    {
+        const int i_mb_y = i / h->sps->i_mb_width;
+        const int i_mb_x = i % h->sps->i_mb_width;
+
+        if( i_mb_y > 0 && i_mb_x == 0 )
+            fprintf( stderr, "\n" );
+
+        if( h->mb.type[i] == I_4x4 )
+            fprintf( stderr, "i" );
+        else if( h->mb.type[i] == I_16x16 )
+            fprintf( stderr, "I" );
+        else if( h->mb.type[i] == P_SKIP )
+            fprintf( stderr, "S" );
+        else if( h->mb.type[i] == P_8x8 )
+            fprintf( stderr, "8" );
+        else if( h->mb.type[i] == P_L0 )
+            fprintf( stderr, "P" );
+        else
+            fprintf( stderr, "?" );
+
+        fprintf( stderr, " " );
+    }
+#endif
+
+#if DEBUG_DUMP_FRAME
+    /* Dump reconstructed frame */
+    x264_frame_dump( h, frame_psnr, "fdec.yuv" );
+#endif
+#if 0
+    if( h->i_ref0 > 0 )
+    {
+        x264_frame_dump( h, h->fref0[0], "ref0.yuv" );
+    }
+    if( h->i_ref1 > 0 )
+    {
+        x264_frame_dump( h, h->fref1[0], "ref1.yuv" );
+    }
+#endif
+    return 0;
+}
+
+/****************************************************************************
+ * x264_encoder_close: print timing / PSNR / macroblock statistics on stderr, then free the encoder and h itself
+ ****************************************************************************/
+void    x264_encoder_close  ( x264_t *h )
+{
+    int64_t i_mtime_total = i_mtime_analyse + i_mtime_encode + i_mtime_write + i_mtime_filter + 1; /* +1: never divide by 0 */
+    int i;
+
+    /* share (in %) and duration of each stage; casts make the arguments match %lld whatever int64_t is */
+    fprintf( stderr, "x264: analyse=%d(%lldms) encode=%d(%lldms) write=%d(%lldms) filter=%d(%lldms)\n",
+             (int)(100*i_mtime_analyse/i_mtime_total), (long long)(i_mtime_analyse/1000),
+             (int)(100*i_mtime_encode/i_mtime_total), (long long)(i_mtime_encode/1000),
+             (int)(100*i_mtime_write/i_mtime_total), (long long)(i_mtime_write/1000),
+             (int)(100*i_mtime_filter/i_mtime_total), (long long)(i_mtime_filter/1000) );
+
+    /* Slices used and PSNR */
+    if( h->stat.i_slice_count[SLICE_TYPE_I] > 0 )
+    {
+        const int i_count = h->stat.i_slice_count[SLICE_TYPE_I];
+        fprintf( stderr, "x264: slice I:%-4d Avg size:%-5d PSNR Y:%2.2f U:%2.2f V:%2.2f PSNR-Y/Size:%2.2f\n",
+                 i_count,
+                 h->stat.i_slice_size[SLICE_TYPE_I] / i_count,
+                 h->stat.f_psnr_y[SLICE_TYPE_I] / i_count,
+                 h->stat.f_psnr_u[SLICE_TYPE_I] / i_count,
+                 h->stat.f_psnr_v[SLICE_TYPE_I] / i_count,
+                 1000*h->stat.f_psnr_y[SLICE_TYPE_I] / h->stat.i_slice_size[SLICE_TYPE_I] );
+    }
+    if( h->stat.i_slice_count[SLICE_TYPE_P] > 0 )
+    {
+        const int i_count = h->stat.i_slice_count[SLICE_TYPE_P];
+        fprintf( stderr, "x264: slice P:%-4d Avg size:%-5d PSNR Y:%2.2f U:%2.2f V:%2.2f PSNR-Y/Size:%2.2f\n",
+                i_count,
+                h->stat.i_slice_size[SLICE_TYPE_P] / i_count,
+                h->stat.f_psnr_y[SLICE_TYPE_P] / i_count,
+                h->stat.f_psnr_u[SLICE_TYPE_P] / i_count,
+                h->stat.f_psnr_v[SLICE_TYPE_P] / i_count,
+                1000.0*h->stat.f_psnr_y[SLICE_TYPE_P] / h->stat.i_slice_size[SLICE_TYPE_P] );
+    }
+    if( h->stat.i_slice_count[SLICE_TYPE_B] > 0 )
+    {
+        fprintf( stderr, "x264: slice B:%-4d Avg size:%-5d PSNR Y:%2.2f U:%2.2f V:%2.2f PSNR-Y/Size:%2.2f\n",
+                h->stat.i_slice_count[SLICE_TYPE_B],
+                h->stat.i_slice_size[SLICE_TYPE_B] / h->stat.i_slice_count[SLICE_TYPE_B],
+                h->stat.f_psnr_y[SLICE_TYPE_B] / h->stat.i_slice_count[SLICE_TYPE_B],
+                h->stat.f_psnr_u[SLICE_TYPE_B] / h->stat.i_slice_count[SLICE_TYPE_B],
+                h->stat.f_psnr_v[SLICE_TYPE_B] / h->stat.i_slice_count[SLICE_TYPE_B],
+                1000*h->stat.f_psnr_y[SLICE_TYPE_B] / h->stat.i_slice_size[SLICE_TYPE_B] );
+    }
+
+    /* MB types used (average count per slice; only printed for I and P slices) */
+    if( h->stat.i_slice_count[SLICE_TYPE_I] > 0 )
+    {
+        const int i_count =  h->stat.i_slice_count[SLICE_TYPE_I];
+        fprintf( stderr, "x264: slice I      Avg I4x4:%-5d I16x16:%-5d\n",
+                 h->stat.i_mb_count[SLICE_TYPE_I][I_4x4]  / i_count,
+                 h->stat.i_mb_count[SLICE_TYPE_I][I_16x16]/ i_count );
+    }
+    if( h->stat.i_slice_count[SLICE_TYPE_P] > 0 )
+    {
+        const int i_count = h->stat.i_slice_count[SLICE_TYPE_P];
+        fprintf( stderr, "x264: slice P      Avg I4x4:%-5d I16x16:%-5d P:%-5d P8x8:%-5d PSKIP:%-5d\n",
+                 h->stat.i_mb_count[SLICE_TYPE_P][I_4x4]  / i_count,
+                 h->stat.i_mb_count[SLICE_TYPE_P][I_16x16]/ i_count,
+                 h->stat.i_mb_count[SLICE_TYPE_P][P_L0] / i_count,
+                 h->stat.i_mb_count[SLICE_TYPE_P][P_8x8] / i_count,
+                 h->stat.i_mb_count[SLICE_TYPE_P][P_SKIP] /i_count );
+    }
+
+    {
+        const int i_count = h->stat.i_slice_count[SLICE_TYPE_I] +
+                            h->stat.i_slice_count[SLICE_TYPE_P] +
+                            h->stat.i_slice_count[SLICE_TYPE_B];
+
+        fprintf( stderr, "x264: overall PSNR Y:%2.2f U:%2.2f V:%2.2f kb/s:%.1f fps:%.3f\n",
+                 (h->stat.f_psnr_y[SLICE_TYPE_I]+h->stat.f_psnr_y[SLICE_TYPE_P]+h->stat.f_psnr_y[SLICE_TYPE_B]) / i_count,
+                 (h->stat.f_psnr_u[SLICE_TYPE_I]+h->stat.f_psnr_u[SLICE_TYPE_P]+h->stat.f_psnr_u[SLICE_TYPE_B]) / i_count,
+                 (h->stat.f_psnr_v[SLICE_TYPE_I]+h->stat.f_psnr_v[SLICE_TYPE_P]+h->stat.f_psnr_v[SLICE_TYPE_B]) / i_count,
+                 h->param.f_fps * 8*(h->stat.i_slice_size[SLICE_TYPE_I]+h->stat.i_slice_size[SLICE_TYPE_P]+h->stat.i_slice_size[SLICE_TYPE_B]) / i_count / 1024,
+                 (double)1000000.0 * (double)i_count / (double)i_mtime_encode_frame );
+    }
+
+    /* frames: release whatever is still queued in the current / next / unused lists */
+    for( i = 0; i < X264_BFRAME_MAX + 1; i++ )
+    {
+        if( h->frames.current[i] ) x264_frame_delete( h->frames.current[i] );
+        if( h->frames.next[i] )    x264_frame_delete( h->frames.next[i] );
+        if( h->frames.unused[i] )  x264_frame_delete( h->frames.unused[i] );
+    }
+    /* ref frames */
+    for( i = 0; i < h->param.i_frame_reference+2; i++ )
+    {
+        x264_frame_delete( h->frames.reference[i] );
+    }
+
+    /* rc */
+    x264_ratecontrol_delete( h->rc );
+
+    x264_macroblock_cache_end( h );
+    x264_free( h->out.p_bitstream );
+    x264_free( h );
+}
+
diff --git a/encoder/macroblock.c b/encoder/macroblock.c
new file mode 100644
index 00000000..353f3d4b
--- /dev/null
+++ b/encoder/macroblock.c
@@ -0,0 +1,859 @@
+/*****************************************************************************
+ * macroblock.c: h264 encoder library
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: macroblock.c,v 1.1 2004/06/03 19:27:08 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdint.h>
+
+#include "../core/common.h"
+#include "macroblock.h"
+
+/* Position, in units of 4 pixels, of each of the 16 luma 4x4 blocks inside the macroblock, and the reverse map. */
+static const uint8_t block_idx_x[16] =  /* block index -> column (0..3) */
+{
+    0, 1, 0, 1, 2, 3, 2, 3, 0, 1, 0, 1, 2, 3, 2, 3
+};
+static const uint8_t block_idx_y[16] =  /* block index -> row (0..3) */
+{
+    0, 0, 1, 1, 0, 0, 1, 1, 2, 2, 3, 3, 2, 2, 3, 3
+};
+static const uint8_t block_idx_xy[4][4] =   /* [column][row] -> block index (inverse of the two tables above) */
+{
+    { 0, 2, 8,  10},
+    { 1, 3, 9,  11},
+    { 4, 6, 12, 14},
+    { 5, 7, 13, 15}
+};
+/* Quantisation multipliers, indexed [i_qscale % 6][y][x]; the DC quantisers below only use entry [..][0][0]. */
+static const int quant_mf[6][4][4] =
+{
+    {  { 13107, 8066, 13107, 8066}, {  8066, 5243,  8066, 5243},
+       { 13107, 8066, 13107, 8066}, {  8066, 5243,  8066, 5243}  },
+    {  { 11916, 7490, 11916, 7490}, {  7490, 4660,  7490, 4660},
+       { 11916, 7490, 11916, 7490}, {  7490, 4660,  7490, 4660}  },
+    {  { 10082, 6554, 10082, 6554}, {  6554, 4194,  6554, 4194},
+       { 10082, 6554, 10082, 6554}, {  6554, 4194,  6554, 4194}  },
+    {  {  9362, 5825,  9362, 5825}, {  5825, 3647,  5825, 3647},
+       {  9362, 5825,  9362, 5825}, {  5825, 3647,  5825, 3647}  },
+    {  {  8192, 5243,  8192, 5243}, {  5243, 3355,  5243, 3355},
+       {  8192, 5243,  8192, 5243}, {  5243, 3355,  5243, 3355}  },
+    {  {  7282, 4559,  7282, 4559}, {  4559, 2893,  4559, 2893},
+       {  7282, 4559,  7282, 4559}, {  4559, 2893,  4559, 2893}  }
+};
+/* 52-entry table: identity for 0..29, then flatter up to 39. NOTE(review): presumably luma qp -> chroma qp; confirm. */
+static const int i_chroma_qp_table[52] =
+{
+     0,  1,  2,  3,  4,  5,  6,  7,  8,  9,
+    10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+    20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+    29, 30, 31, 32, 32, 33, 34, 34, 35, 35,
+    36, 36, 37, 37, 37, 38, 38, 38, 39, 39,
+    39, 39
+};
+
+/****************************************************************************
+ * Scan and Quant functions
+ ****************************************************************************/
+//static const int scan_zigzag_x[16]={0, 1, 0, 0, 1, 2, 3, 2, 1, 0, 1, 2, 3, 3, 2, 3};
+//static const int scan_zigzag_y[16]={0, 0, 1, 2, 1, 0, 0, 1, 2, 3, 3, 2, 1, 2, 3, 3};
+/* Copy all 16 coefficients of dct[][] into level[] in zigzag order, starting with dct[0][0]. */
+static inline void scan_zigzag_4x4full( int level[16], int16_t dct[4][4] )
+{
+    level[0] = dct[0][0];
+    level[1] = dct[0][1];
+    level[2] = dct[1][0];
+    level[3] = dct[2][0];
+    level[4] = dct[1][1];
+    level[5] = dct[0][2];
+    level[6] = dct[0][3];
+    level[7] = dct[1][2];
+    level[8] = dct[2][1];
+    level[9] = dct[3][0];
+    level[10] = dct[3][1];
+    level[11] = dct[2][2];
+    level[12] = dct[1][3];
+    level[13] = dct[2][3];
+    level[14] = dct[3][2];
+    level[15] = dct[3][3];
+#if 0 /* disabled loop form of the assignments above; needs the commented-out scan_zigzag_x/y tables */
+    int i;
+    for( i = 0; i < 16; i++ )
+    {
+        level[i] = dct[scan_zigzag_y[i]][scan_zigzag_x[i]];
+    }
+#endif
+}
+static inline void scan_zigzag_4x4( int level[15], int16_t dct[4][4] )
+{   /* Same zigzag order as scan_zigzag_4x4full but without dct[0][0]: only 15 coefficients are written. */
+    level[0] = dct[0][1];
+    level[1] = dct[1][0];
+    level[2] = dct[2][0];
+    level[3] = dct[1][1];
+    level[4] = dct[0][2];
+    level[5] = dct[0][3];
+    level[6] = dct[1][2];
+    level[7] = dct[2][1];
+    level[8] = dct[3][0];
+    level[9] = dct[3][1];
+    level[10] = dct[2][2];
+    level[11] = dct[1][3];
+    level[12] = dct[2][3];
+    level[13] = dct[3][2];
+    level[14] = dct[3][3];
+#if 0 /* disabled loop form of the assignments above; needs the commented-out scan_zigzag_x/y tables */
+    int i;
+    for( i = 1; i < 16; i++ )
+    {
+        level[i - 1] = dct[scan_zigzag_y[i]][scan_zigzag_x[i]];
+    }
+#endif
+}
+/* Copy the four coefficients of a 2x2 block into level[] row by row. */
+static inline void scan_zigzag_2x2_dc( int level[4], int16_t dct[2][2] )
+{
+    level[0] = dct[0][0];
+    level[1] = dct[0][1];
+    level[2] = dct[1][0];
+    level[3] = dct[1][1];
+}
+
+/* Quantise a 4x4 block in place: level = sign(c) * ( ( |c| * quant_mf + f ) >> i_qbits ), f larger for intra. */
+static void quant_4x4( int16_t dct[4][4], int i_qscale, int b_intra )
+{
+    const int i_qbits = 15 + i_qscale / 6;
+    const int i_mf = i_qscale % 6;
+    const int f = ( 1 << i_qbits ) / ( b_intra ? 3 : 6 );    /* rounding offset: 1/3 (intra) or 1/6 (inter) of a step */
+
+    int x,y;
+    for( y = 0; y < 4; y++ )
+    {
+        for( x = 0; x < 4; x++ )
+        {
+            if( dct[y][x] > 0 )
+            {
+                dct[y][x] =( f + dct[y][x]  * quant_mf[i_mf][y][x] ) >> i_qbits;
+            }
+            else    /* zero or negative: quantise the magnitude, then restore the sign */
+            {
+                dct[y][x] = - ( ( f - dct[y][x]  * quant_mf[i_mf][y][x] ) >> i_qbits );
+            }
+        }
+    }
+}
+static void quant_4x4_dc( int16_t dct[4][4], int i_qscale )
+{   /* Quantise a 4x4 block of DC terms in place: one multiplier (quant_mf[..][0][0]) and one extra bit of shift. */
+    const int i_qbits = 15 + i_qscale / 6;
+    const int f2 = ( 2 << i_qbits ) / 3;     /* rounding offset, 1/3 of the (doubled) step */
+    const int i_qmf = quant_mf[i_qscale%6][0][0];
+    int x,y;
+
+    for( y = 0; y < 4; y++ )
+    {
+        for( x = 0; x < 4; x++ )
+        {
+            if( dct[y][x] > 0 )
+            {
+                dct[y][x] =( f2 + dct[y][x]  * i_qmf) >> ( 1 + i_qbits );
+            }
+            else    /* zero or negative: quantise the magnitude, then restore the sign */
+            {
+                dct[y][x] = - ( ( f2 - dct[y][x]  * i_qmf ) >> (1 + i_qbits ) );
+            }
+        }
+    }
+}
+static void quant_2x2_dc( int16_t dct[2][2], int i_qscale, int b_intra )
+{   /* Quantise a 2x2 block of DC terms in place; same formula as quant_4x4_dc, offset depends on b_intra. */
+    int const i_qbits = 15 + i_qscale / 6;
+    const int f2 = ( 2 << i_qbits ) / ( b_intra ? 3 : 6 );   /* rounding offset: 1/3 (intra) or 1/6 (inter) */
+    const int i_qmf = quant_mf[i_qscale%6][0][0];
+
+    int x,y;
+    for( y = 0; y < 2; y++ )
+    {
+        for( x = 0; x < 2; x++ )
+        {
+            if( dct[y][x] > 0 )
+            {
+                dct[y][x] =( f2 + dct[y][x]  * i_qmf) >> ( 1 + i_qbits );
+            }
+            else    /* zero or negative: quantise the magnitude, then restore the sign */
+            {
+                dct[y][x] = - ( ( f2 - dct[y][x]  * i_qmf ) >> (1 + i_qbits ) );
+            }
+        }
+    }
+}
+#if 0 /* disabled alternative quantisers using a per-coefficient rounding offset instead of the fixed 1/3 - 1/6 */
+/* From a JVT doc: rounding offsets stored as fractions, f = num * 2^i_qbits / den */
+static const int f_deadzone_intra[4][4][2] = /* [num][den] */
+{
+    { {1,2}, {3,7}, {2,5}, {1,3} },
+    { {3,7}, {2,5}, {1,3}, {1,4} },
+    { {2,5}, {1,3}, {1,4}, {1,5} },
+    { {1,3}, {1,4}, {1,5}, {1,5} }
+};
+static const int f_deadzone_inter[4][4][2] = /* [num][den] */
+{
+    { {1,3}, {2,7}, {4,15},{2,9} },
+    { {2,7}, {4,15},{2,9}, {1,6} },
+    { {4,15},{2,9}, {1,6}, {1,7} },
+    { {2,9}, {1,6}, {1,7}, {2,15} }
+};
+
+
+static void quant_4x4( int16_t dct[4][4], int i_qscale, int b_intra )
+{
+    const int(*f_deadzone)[4][4][2] = b_intra ? &f_deadzone_intra : &f_deadzone_inter;
+    const int i_qbits = 15 + i_qscale / 6;
+    const int i_mf = i_qscale % 6;
+
+    int x,y;
+    for( y = 0; y < 4; y++ )
+    {
+        for( x = 0; x < 4; x++ )
+        {
+#if 0
+            const int f = b_intra ?
+                          (f_deadzone_intra[y][x][0] * ( 1 << i_qbits ) / f_deadzone_intra[y][x][1])
+                          :
+                          (f_deadzone_inter[y][x][0] * ( 1 << i_qbits ) / f_deadzone_inter[y][x][1]);
+#else
+            const int f = (*f_deadzone)[y][x][0] * ( 1 << i_qbits ) / (*f_deadzone)[y][x][1];
+#endif
+
+            if( dct[y][x] > 0 )
+            {
+                dct[y][x] =( f + dct[y][x]  * quant_mf[i_mf][y][x] ) >> i_qbits;
+            }
+            else
+            {
+                dct[y][x] = - ( ( f - dct[y][x]  * quant_mf[i_mf][y][x] ) >> i_qbits );
+            }
+        }
+    }
+}
+
+static void quant_4x4_dc( int16_t dct[4][4], int i_qscale )
+{
+    const int i_qbits = 15 + i_qscale / 6;
+    const int i_qmf = quant_mf[i_qscale%6][0][0];
+    const int f2 = f_deadzone_intra[0][0][0] * ( 2 << i_qbits ) / f_deadzone_intra[0][0][1];
+    int x,y;
+
+    for( y = 0; y < 4; y++ )
+    {
+        for( x = 0; x < 4; x++ )
+        {
+
+            if( dct[y][x] > 0 )
+            {
+                dct[y][x] =( f2 + dct[y][x]  * i_qmf) >> ( 1 + i_qbits );
+            }
+            else
+            {
+                dct[y][x] = - ( ( f2 - dct[y][x]  * i_qmf ) >> (1 + i_qbits ) );
+            }
+        }
+    }
+}
+
+static void quant_2x2_dc( int16_t dct[2][2], int i_qscale, int b_intra )
+{
+    int const i_qbits = 15 + i_qscale / 6;
+    const int i_qmf = quant_mf[i_qscale%6][0][0];
+    const int f2 = b_intra ?
+                   (f_deadzone_intra[0][0][0] * ( 2 << i_qbits ) / f_deadzone_intra[0][0][1])
+                   :
+                   (f_deadzone_inter[0][0][0] * ( 2 << i_qbits ) / f_deadzone_inter[0][0][1]);
+    int x,y;
+    for( y = 0; y < 2; y++ )
+    {
+        for( x = 0; x < 2; x++ )
+        {
+            if( dct[y][x] > 0 )
+            {
+                dct[y][x] =( f2 + dct[y][x]  * i_qmf) >> ( 1 + i_qbits );
+            }
+            else
+            {
+                dct[y][x] = - ( ( f2 - dct[y][x]  * i_qmf ) >> (1 + i_qbits ) );
+            }
+        }
+    }
+}
+
+
+#endif
+/* Return how many of the first i_count entries of v[] are non-zero. */
+static inline int array_non_zero_count( int *v, int i_count )
+{
+    int i;
+    int i_nz;
+
+    for( i = 0, i_nz = 0; i < i_count; i++ )
+    {
+        if( v[i] )
+        {
+            i_nz++;
+        }
+    }
+    return i_nz;
+}
+
+/* (ref: JVT-B118)
+ * x264_mb_decimate_score: given the i_max scanned coeffs in dct[], it returns a score to see if we could empty them
+ * to 0 (low score means set it to null); returns 9 as soon as one coeff has an absolute value above 1
+ * Used in inter macroblock (luma and chroma)
+ *  luma: for a 8x8 block: if score < 4 -> null
+ *        for the complete mb: if score < 6 -> null
+ *  chroma: for the complete mb: if score < 7 -> null
+ */
+static int x264_mb_decimate_score( int *dct, int i_max )
+{
+    static const int i_ds_table[16] = { 3, 2, 2, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; /* cost by zero-run length */
+
+    int i_score = 0;
+    int idx = i_max - 1;
+
+    while( idx >= 0 && dct[idx] == 0 )  /* skip the trailing zeros: start from the last non-zero coeff */
+    {
+        idx--;
+    }
+
+    while( idx >= 0 )   /* walk backwards, one non-zero coeff per iteration */
+    {
+        int i_run;
+
+        if( abs( dct[idx--] ) > 1 )
+        {
+            return 9;
+        }
+
+        i_run = 0;  /* number of zeros found just before this coeff (towards index 0) */
+        while( idx >= 0 && dct[idx] == 0 )
+        {
+            idx--;
+            i_run++;
+        }
+        i_score += i_ds_table[i_run];
+    }
+
+    return i_score;
+}
+
+/* Encode one intra 4x4 luma block (idx): forward transform, quantize, store the
+ * zigzag-scanned levels in h->dct.block[idx], then output the samples to fdec. */
+void x264_mb_encode_i4x4( x264_t *h, int idx, int i_qscale )
+{
+    const int i_stride = h->mb.pic.i_stride[0];
+    const int i_offset = 4 * block_idx_x[idx] + 4 * block_idx_y[idx] * i_stride;
+    uint8_t  *p_enc = &h->mb.pic.p_fenc[0][i_offset];
+    uint8_t  *p_dec = &h->mb.pic.p_fdec[0][i_offset];
+    int16_t  coefs[4][4];
+
+    /* forward path: transform, quantize (intra), scan */
+    h->dctf.sub4x4_dct( coefs, p_enc, i_stride, p_dec, i_stride );
+    quant_4x4( coefs, i_qscale, 1 );
+    scan_zigzag_4x4full( h->dct.block[idx].luma4x4, coefs );
+
+    /* reconstruction path: dequantize, then output samples to fdec */
+    x264_mb_dequant_4x4( coefs, i_qscale );
+    h->dctf.add4x4_idct( p_dec, i_stride, coefs );
+}
+
+/* Encode an intra 16x16 luma macroblock: the DC terms of the 16 blocks go
+ * through their own 4x4 DC transform, the blocks are stored as residual_ac. */
+static void x264_mb_encode_i16x16( x264_t *h, int i_qscale )
+{
+    const int i_stride = h->mb.pic.i_stride[0];
+    uint8_t  *p_enc = h->mb.pic.p_fenc[0];
+    uint8_t  *p_dec = h->mb.pic.p_fdec[0];
+
+    int16_t coefs[16+1][4][4];          /* [0]: DC block, [1..16]: the 4x4 blocks */
+    int16_t (*dc)[4]    = coefs[0];
+    int16_t (*ac)[4][4] = &coefs[1];
+    int blk;
+
+    h->dctf.sub16x16_dct( ac, p_enc, i_stride, p_dec, i_stride );
+    for( blk = 0; blk < 16; blk++ )
+    {
+        /* save the DC term before the block is quantized */
+        dc[block_idx_y[blk]][block_idx_x[blk]] = ac[blk][0][0];
+
+        /* quant (intra) / scan / dequant of the block itself */
+        quant_4x4( ac[blk], i_qscale, 1 );
+        scan_zigzag_4x4( h->dct.block[blk].residual_ac, ac[blk] );
+        x264_mb_dequant_4x4( ac[blk], i_qscale );
+    }
+
+    /* DC levels: transform, quantize, scan */
+    h->dctf.dct4x4dc( dc );
+    quant_4x4_dc( dc, i_qscale );
+    scan_zigzag_4x4full( h->dct.luma16x16_dc, dc );
+
+    /* rebuild the DC terms: idct4x4dc is called first, the dequantizer second */
+    h->dctf.idct4x4dc( dc );
+    x264_mb_dequant_4x4_dc( dc, i_qscale );  /* XXX not inversed */
+
+    /* put each rebuilt DC term back into its block, then put pixels to fdec */
+    for( blk = 0; blk < 16; blk++ )
+        ac[blk][0][0] = dc[block_idx_y[blk]][block_idx_x[blk]];
+    h->dctf.add16x16_idct( p_dec, i_stride, ac );
+}
+
+/* Encode both chroma planes of the macroblock: AC levels per 4x4 block, DC levels via a 2x2 transform. */
+static void x264_mb_encode_8x8( x264_t *h, int b_inter, int i_qscale )
+{
+    int i, ch;
+
+    for( ch = 0; ch < 2; ch++ )
+    {
+        const int i_stride = h->mb.pic.i_stride[1+ch];
+        uint8_t  *p_src = h->mb.pic.p_fenc[1+ch];
+        uint8_t  *p_dst = h->mb.pic.p_fdec[1+ch];
+        int i_decimate_score = 0;
+        int16_t dct2x2[2][2];
+        int16_t dct4x4[4][4][4];
+
+        h->dctf.sub8x8_dct( dct4x4, p_src, i_stride, p_dst, i_stride );
+        /* per 4x4 block: save the DC term, then quant/scan/dequant the block */
+        for( i = 0; i < 4; i++ )
+        {
+            /* copy dc coeff (taken before quantization) into the 2x2 DC block */
+            dct2x2[block_idx_y[i]][block_idx_x[i]] = dct4x4[i][0][0];
+
+            quant_4x4( dct4x4[i], i_qscale, b_inter ? 0 : 1 ); /* same intra flag as given to quant_2x2_dc below */
+            scan_zigzag_4x4( h->dct.block[16+i+ch*4].residual_ac, dct4x4[i] );
+            x264_mb_dequant_4x4( dct4x4[i], i_qscale );
+
+            if( b_inter )
+            {
+                i_decimate_score += x264_mb_decimate_score( h->dct.block[16+i+ch*4].residual_ac, 15 );
+            }
+        }
+
+        h->dctf.dct2x2dc( dct2x2 );
+        quant_2x2_dc( dct2x2, i_qscale, b_inter ? 0 : 1 );
+        scan_zigzag_2x2_dc( h->dct.chroma_dc[ch], dct2x2 );
+
+        /* output samples to fdec */
+        h->dctf.idct2x2dc( dct2x2 );
+        x264_mb_dequant_2x2_dc( dct2x2, i_qscale );  /* XXX not inversed */
+
+        if( b_inter && i_decimate_score < 7 )
+        {
+            /* Near null chroma 8x8 block so make it null (bits saving) */
+            for( i = 0; i < 4; i++ )
+            {
+                int x, y;
+                for( x = 0; x < 15; x++ )
+                {
+                    h->dct.block[16+i+ch*4].residual_ac[x] = 0;
+                }
+                for( x = 0; x < 4; x++ )
+                {
+                    for( y = 0; y < 4; y++ )
+                    {
+                        dct4x4[i][x][y] = 0;   /* [0][0] is refilled from dct2x2 below */
+                    }
+                }
+            }
+        }
+
+        /* put the DC terms from dct2x2 back into the blocks, then output samples to fdec */
+        for( i = 0; i < 4; i++ )
+        {
+            /* copy dc coeff */
+            dct4x4[i][0][0] = dct2x2[block_idx_y[i]][block_idx_x[i]];
+        }
+        h->dctf.add8x8_idct( p_dst, i_stride, dct4x4 );
+    }
+}
+
+/*****************************************************************************
+ * x264_macroblock_encode_pskip:
+ *  Encode a macroblock that is already marked as skipped: only motion
+ *  compensation is run, then the cbp and non-zero counts are cleared.
+ *****************************************************************************/
+void x264_macroblock_encode_pskip( x264_t *h )
+{
+    const int i_mvx = h->mb.cache.mv[0][x264_scan8[0]][0];
+    const int i_mvy = h->mb.cache.mv[0][x264_scan8[0]][1];
+    int i_plane;
+    int i_blk;
+
+    /* Motion compensation XXX probably unneeded */
+    h->mc[MC_LUMA]( h->mb.pic.p_fref[0][0][0], h->mb.pic.i_stride[0],
+                    h->mb.pic.p_fdec[0],       h->mb.pic.i_stride[0],
+                    i_mvx, i_mvy, 16, 16 );
+
+    /* Chroma MC, one 8x8 block for each of the two chroma planes */
+    for( i_plane = 1; i_plane <= 2; i_plane++ )
+    {
+        h->mc[MC_CHROMA]( h->mb.pic.p_fref[0][0][i_plane], h->mb.pic.i_stride[i_plane],
+                          h->mb.pic.p_fdec[i_plane],       h->mb.pic.i_stride[i_plane],
+                          i_mvx, i_mvy, 8, 8 );
+    }
+
+    /* nothing is coded: zero the count of all 16+8 blocks */
+    for( i_blk = 0; i_blk < 16+8; i_blk++ )
+    {
+        h->mb.cache.non_zero_count[x264_scan8[i_blk]] = 0;
+    }
+
+    /* clear and store cbp */
+    h->mb.i_cbp_luma = h->mb.i_cbp_chroma = 0x00;
+    h->mb.cbp[h->mb.i_mb_xy] = 0;
+}
+
+/*****************************************************************************
+ * x264_macroblock_encode:
+ *  Transform, quantize and reconstruct the current macroblock (h->mb.i_type),
+ *  then derive non_zero_count and the cbp; may turn a P_L0 16x16 into P_SKIP.
+ *****************************************************************************/
+void x264_macroblock_encode( x264_t *h )
+{
+    int i_cbp_dc = 0;
+    int i_qscale;
+    int i;
+
+    if( h->mb.i_type == P_SKIP )
+    {
+        /* A bit special */
+        x264_macroblock_encode_pskip( h );
+        return;
+    }
+
+    /* quantification scale */
+    i_qscale = h->mb.qp[h->mb.i_mb_xy];
+
+    if( h->mb.i_type == I_16x16 )
+    {
+        const int i_mode = h->mb.i_intra16x16_pred_mode;
+        /* do the right prediction */
+        h->predict_16x16[i_mode]( h->mb.pic.p_fdec[0], h->mb.pic.i_stride[0] );
+
+        /* encode the 16x16 macroblock */
+        x264_mb_encode_i16x16( h, i_qscale );
+
+        /* fix the pred mode value */
+        h->mb.i_intra16x16_pred_mode = x264_mb_pred_mode16x16_fix[i_mode];
+    }
+    else if( h->mb.i_type == I_4x4 )
+    {
+        for( i = 0; i < 16; i++ )
+        {
+            const int i_dst = h->mb.pic.i_stride[0];
+            uint8_t  *p_dst = &h->mb.pic.p_fdec[0][4 * block_idx_x[i] + 4 * block_idx_y[i] * i_dst];
+            int      i_mode = h->mb.cache.intra4x4_pred_mode[x264_scan8[i]];
+
+            /* Do the right prediction */
+            h->predict_4x4[i_mode]( p_dst, i_dst );
+
+            /* encode one 4x4 block */
+            x264_mb_encode_i4x4( h, i, i_qscale );
+
+            /* fix the pred mode value */
+            h->mb.cache.intra4x4_pred_mode[x264_scan8[i]] = x264_mb_pred_mode4x4_fix[i_mode];
+        }
+    }
+    else    /* Inter MB */
+    {
+        int16_t dct4x4[16][4][4];
+        int i8x8, i4x4, idx;
+        int i_decimate_mb = 0;
+
+        /* Motion compensation */
+        x264_mb_mc( h );
+
+        h->dctf.sub16x16_dct( dct4x4,
+                              h->mb.pic.p_fenc[0], h->mb.pic.i_stride[0],
+                              h->mb.pic.p_fdec[0], h->mb.pic.i_stride[0] );
+
+        for( i8x8 = 0; i8x8 < 4; i8x8++ )
+        {
+            int i_decimate_8x8;
+
+            /* quant/scan/dequant the four 4x4 blocks of this 8x8 block */
+            i_decimate_8x8 = 0;
+            for( i4x4 = 0; i4x4 < 4; i4x4++ )
+            {
+                idx = i8x8 * 4 + i4x4;
+
+                quant_4x4( dct4x4[idx], i_qscale, 0 );
+                scan_zigzag_4x4full( h->dct.block[idx].luma4x4, dct4x4[idx] );
+                x264_mb_dequant_4x4( dct4x4[idx], i_qscale );
+
+                i_decimate_8x8 += x264_mb_decimate_score( h->dct.block[idx].luma4x4, 16 );
+            }
+
+            /* decimate this 8x8 block (its score still counts for the whole mb) */
+            i_decimate_mb += i_decimate_8x8;
+            if( i_decimate_8x8 < 4 )
+            {
+                for( i4x4 = 0; i4x4 < 4; i4x4++ )
+                {
+                    int x, y;
+                    idx = i8x8 * 4 + i4x4;
+                    for( i = 0; i < 16; i++ )
+                    {
+                        h->dct.block[idx].luma4x4[i] = 0;
+                    }
+                    for( x = 0; x < 4; x++ )
+                    {
+                        for( y = 0; y < 4; y++ )
+                        {
+                            dct4x4[idx][x][y] = 0;
+                        }
+                    }
+                }
+            }
+        }
+
+        if( i_decimate_mb < 6 )     /* whole mb near null: drop every luma level, fdec is left untouched */
+        {
+            for( idx = 0; idx < 16; idx++ )
+            {
+                for( i = 0; i < 16; i++ )
+                {
+                    h->dct.block[idx].luma4x4[i] = 0;
+                }
+            }
+        }
+        else
+        {
+            h->dctf.add16x16_idct( h->mb.pic.p_fdec[0], h->mb.pic.i_stride[0], dct4x4 );
+        }
+    }
+
+    /* encode chroma */
+    i_qscale = i_chroma_qp_table[x264_clip3( i_qscale + h->pps->i_chroma_qp_index_offset, 0, 51 )];
+    if( IS_INTRA( h->mb.i_type ) )
+    {
+        const int i_mode = h->mb.i_chroma_pred_mode;
+        /* do the right prediction */
+        h->predict_8x8[i_mode]( h->mb.pic.p_fdec[1], h->mb.pic.i_stride[1] );
+        h->predict_8x8[i_mode]( h->mb.pic.p_fdec[2], h->mb.pic.i_stride[2] );
+
+        /* fix the pred mode value */
+        h->mb.i_chroma_pred_mode = x264_mb_pred_mode8x8_fix[i_mode];
+    }
+
+    /* encode the 8x8 blocks */
+    x264_mb_encode_8x8( h, !IS_INTRA( h->mb.i_type ), i_qscale );
+
+    /* Calculate the Luma/Chroma pattern and non_zero_count */
+    if( h->mb.i_type == I_16x16 )
+    {
+        h->mb.i_cbp_luma = 0x00;
+        for( i = 0; i < 16; i++ )
+        {
+            const int nz = array_non_zero_count( h->dct.block[i].residual_ac, 15 );
+            h->mb.cache.non_zero_count[x264_scan8[i]] = nz;
+            if( nz > 0 )
+            {
+                h->mb.i_cbp_luma = 0x0f;    /* all or nothing for I_16x16 */
+            }
+        }
+    }
+    else
+    {
+        h->mb.i_cbp_luma = 0x00;
+        for( i = 0; i < 16; i++ )
+        {
+            const int nz = array_non_zero_count( h->dct.block[i].luma4x4, 16 );
+            h->mb.cache.non_zero_count[x264_scan8[i]] = nz;
+            if( nz > 0 )
+            {
+                h->mb.i_cbp_luma |= 1 << (i/4);    /* one bit per group of four 4x4 blocks */
+            }
+        }
+    }
+
+    /* Calculate the chroma pattern */
+    h->mb.i_cbp_chroma = 0x00;
+    for( i = 0; i < 8; i++ )
+    {
+        const int nz = array_non_zero_count( h->dct.block[16+i].residual_ac, 15 );
+        h->mb.cache.non_zero_count[x264_scan8[16+i]] = nz;
+        if( nz > 0 )
+        {
+            h->mb.i_cbp_chroma = 0x02;    /* dc+ac (we can't do only ac) */
+        }
+    }
+    if( h->mb.i_cbp_chroma == 0x00 &&
+        ( array_non_zero_count( h->dct.chroma_dc[0], 4 ) > 0 || array_non_zero_count( h->dct.chroma_dc[1], 4 ) > 0 ) )
+    {
+        h->mb.i_cbp_chroma = 0x01;    /* dc only */
+    }
+
+    if( h->param.b_cabac )
+    {
+        /* i_cbp_dc starts at 0; bit 0: luma 16x16 dc, bit 1 / bit 2: dc of chroma plane 0 / 1 */
+        if( h->mb.i_type == I_16x16 && array_non_zero_count( h->dct.luma16x16_dc, 16 ) > 0 )
+            i_cbp_dc = 0x01;
+
+        if( array_non_zero_count( h->dct.chroma_dc[0], 4 ) > 0 )
+            i_cbp_dc |= 0x02;
+        if( array_non_zero_count( h->dct.chroma_dc[1], 4 ) > 0 )
+            i_cbp_dc |= 0x04;
+    }
+
+    /* store cbp */
+    h->mb.cbp[h->mb.i_mb_xy] = (i_cbp_dc << 8) | (h->mb.i_cbp_chroma << 4) | h->mb.i_cbp_luma;
+
+    /* Check for P_SKIP
+     * XXX: in the me perhaps we should take x264_mb_predict_mv_pskip into account
+     *      (if multiple mv give same result)*/
+    if( h->mb.i_type == P_L0 && h->mb.i_partition == D_16x16 &&
+        h->mb.i_cbp_luma == 0x00 && h->mb.i_cbp_chroma== 0x00 &&
+        h->mb.qp[h->mb.i_mb_xy] == h->mb.i_last_qp )
+    {
+        if( h->mb.cache.ref[0][x264_scan8[0]] == 0 )
+        {
+            int mvp[2];
+
+            x264_mb_predict_mv_pskip( h, mvp );
+            if( h->mb.cache.mv[0][x264_scan8[0]][0] == mvp[0] &&
+                h->mb.cache.mv[0][x264_scan8[0]][1] == mvp[1] )
+            {
+                h->mb.type[h->mb.i_mb_xy] = h->mb.i_type = P_SKIP;
+            }
+        }
+    }
+}
+
+/*****************************************************************************
+ * x264_macroblock_probe_pskip:
+ *  Check if the current MB could be encoded as a P_SKIP (it supposes you use
+ *  the previous QP). Returns 1 if it could, 0 otherwise.
+ *****************************************************************************/
+int x264_macroblock_probe_pskip( x264_t *h )
+{
+    DECLARE_ALIGNED( int16_t, dct4x4[16][4][4], 16 );
+    DECLARE_ALIGNED( int16_t, dct2x2[2][2], 16 );
+    DECLARE_ALIGNED( int,     dctscan[16], 16 );
+
+    int i_qp;
+    int mvp[2];
+    int ch;
+    int n;
+
+    int i8x8, i4x4;
+    int i_decimate_mb;
+
+    /* quantification scale */
+    i_qp = h->mb.qp[h->mb.i_mb_xy];
+
+    /* Get the MV */
+    x264_mb_predict_mv_pskip( h, mvp );
+
+    /* Special case, need to clip the vector. NOTE(review): mvp is summed with a pixel position
+     * (16*mb) but reaches mc unscaled, like quarter-pel vectors elsewhere - confirm units. */
+    n = 16 * h->mb.i_mb_x + mvp[0];
+    if( n < -24 )
+        mvp[0] = -24 - 16*h->mb.i_mb_x;
+    else if( n > 16 * h->sps->i_mb_width + 24 )
+        mvp[0] = 16 * ( h->sps->i_mb_width - h->mb.i_mb_x ) + 24;
+
+    n = 16 * h->mb.i_mb_y + mvp[1];
+    if( n < -24 )
+        mvp[1] = -24 - 16*h->mb.i_mb_y;
+    else if( n > 16 * h->sps->i_mb_height + 8 )
+        mvp[1] = 16 * ( h->sps->i_mb_height - h->mb.i_mb_y ) + 8;
+
+    /* Motion compensation */
+    h->mc[MC_LUMA]( h->mb.pic.p_fref[0][0][0], h->mb.pic.i_stride[0],
+                    h->mb.pic.p_fdec[0],       h->mb.pic.i_stride[0],
+                    mvp[0], mvp[1], 16, 16 );
+
+    /* get luma diff */
+    h->dctf.sub16x16_dct( dct4x4, h->mb.pic.p_fenc[0], h->mb.pic.i_stride[0],
+                                  h->mb.pic.p_fdec[0], h->mb.pic.i_stride[0] );
+
+    for( i8x8 = 0, i_decimate_mb = 0; i8x8 < 4; i8x8++ )
+    {
+        /* score each 4x4 block; the score is accumulated over the whole luma mb */
+        for( i4x4 = 0; i4x4 < 4; i4x4++ )
+        {
+            const int idx = i8x8 * 4 + i4x4;
+
+            quant_4x4( dct4x4[idx], i_qp, 0 );
+            scan_zigzag_4x4full( dctscan, dct4x4[idx] );
+
+            i_decimate_mb += x264_mb_decimate_score( dctscan, 16 );
+
+            if( i_decimate_mb >= 6 )
+            {
+                /* not as P_SKIP */
+                return 0;
+            }
+        }
+    }
+
+    /* encode chroma */
+    i_qp = i_chroma_qp_table[x264_clip3( i_qp + h->pps->i_chroma_qp_index_offset, 0, 51 )];
+
+    for( ch = 0; ch < 2; ch++ )
+    {
+        const int i_stride = h->mb.pic.i_stride[1+ch];
+        uint8_t  *p_src = h->mb.pic.p_fenc[1+ch];
+        uint8_t  *p_dst = h->mb.pic.p_fdec[1+ch];
+
+        h->mc[MC_CHROMA]( h->mb.pic.p_fref[0][0][1+ch], i_stride,
+                          h->mb.pic.p_fdec[1+ch],       i_stride,
+                          mvp[0], mvp[1], 8, 8 );
+
+        h->dctf.sub8x8_dct( dct4x4, p_src, i_stride, p_dst, i_stride ); /* luma buffer reused; only dct4x4[0..3] are read below */
+
+        /* calculate dct DC */
+        dct2x2[0][0] = dct4x4[0][0][0];
+        dct2x2[0][1] = dct4x4[1][0][0];
+        dct2x2[1][0] = dct4x4[2][0][0];
+        dct2x2[1][1] = dct4x4[3][0][0];
+        h->dctf.dct2x2dc( dct2x2 );
+        quant_2x2_dc( dct2x2, i_qp, 0 );
+        if( dct2x2[0][0] || dct2x2[0][1] || dct2x2[1][0] || dct2x2[1][1]  )
+        {
+            /* can't be: any non-zero quantized chroma DC rules out P_SKIP */
+            return 0;
+        }
+
+        /* calculate dct coeffs; the decimate score restarts at 0 for each chroma plane */
+        for( i4x4 = 0, i_decimate_mb = 0; i4x4 < 4; i4x4++ )
+        {
+            quant_4x4( dct4x4[i4x4], i_qp, 0 );
+            scan_zigzag_4x4( dctscan, dct4x4[i4x4] );
+
+            i_decimate_mb += x264_mb_decimate_score( dctscan, 15 );
+            if( i_decimate_mb >= 7 )
+            {
+                return 0;
+            }
+        }
+    }
+
+    return 1;
+}
diff --git a/encoder/macroblock.h b/encoder/macroblock.h
new file mode 100644
index 00000000..b030755c
--- /dev/null
+++ b/encoder/macroblock.h
@@ -0,0 +1,38 @@
+/*****************************************************************************
+ * macroblock.h: h264 encoder library
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: macroblock.h,v 1.1 2004/06/03 19:27:08 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#ifndef _ENCODER_MACROBLOCK_H
+#define _ENCODER_MACROBLOCK_H 1
+
+#include "../core/macroblock.h"
+
+int x264_macroblock_probe_pskip( x264_t *h );   /* 1 if the current MB could be coded as P_SKIP, else 0 */
+
+void x264_macroblock_encode      ( x264_t *h ); /* transform, quantize and reconstruct the current MB */
+void x264_macroblock_write_cabac ( x264_t *h, bs_t *s );
+void x264_macroblock_write_cavlc ( x264_t *h, bs_t *s );
+
+void x264_cabac_mb_skip( x264_t *h, int b_skip );
+
+#endif
+
diff --git a/encoder/me.c b/encoder/me.c
new file mode 100644
index 00000000..b1653754
--- /dev/null
+++ b/encoder/me.c
@@ -0,0 +1,194 @@
+/*****************************************************************************
+ * me.c: h264 encoder library (Motion Estimation)
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: me.c,v 1.1 2004/06/03 19:27:08 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdint.h>
+
+#include "../core/common.h"
+#include "me.h"
+
+/* Full-pel diamond search (at most 16 steps) started from the predicted mv or the optional
+ * candidate m->mvc; stores the best mv in quarter-pel units in m->mv and its cost in m->cost. */
+void x264_me_search( x264_t *h, x264_me_t *m )
+{
+    const int i_pixel = m->i_pixel;
+    int bcost;
+    int bmx, bmy;               /* best position so far, in full pixels */
+    uint8_t *p_fref = m->p_fref;
+    int i_iter;
+
+    /* init with mvp */
+    /* XXX: only the starting points (mvp, mvc) are clamped to +/- i_mv_range
+     * (full-pel); the diamond steps below are not clamped, on the stated
+     * assumption that they never leave the padded picture.
+     * XXX: if some vector can go outside, (accelerator, ....) you need to clip
+     * them yourself */
+    bmx = x264_clip3( ( m->mvp[0] + 2 ) >> 2, -m->i_mv_range, m->i_mv_range );
+    bmy = x264_clip3( ( m->mvp[1] + 2 ) >> 2, -m->i_mv_range, m->i_mv_range );
+
+    p_fref = &m->p_fref[bmy * m->i_stride + bmx];
+    bcost = h->pixf.sad[i_pixel]( m->p_fenc, m->i_stride, p_fref, m->i_stride ); /* no mv cost term for the start point */
+
+    /* try a candidate if provided */
+    if( m->b_mvc )
+    {
+        const int mx = x264_clip3( ( m->mvc[0] + 2 ) >> 2, -m->i_mv_range, m->i_mv_range );
+        const int my = x264_clip3( ( m->mvc[1] + 2 ) >> 2, -m->i_mv_range, m->i_mv_range );
+        uint8_t *p_fref2 = &m->p_fref[my*m->i_stride+mx];
+        int cost = h->pixf.sad[i_pixel]( m->p_fenc, m->i_stride, p_fref2, m->i_stride ) +
+                   m->lm * ( bs_size_se( m->mvc[0] - m->mvp[0] ) + bs_size_se( m->mvc[1] - m->mvp[1] ) );
+        if( cost < bcost )
+        {
+            bmx = mx;
+            bmy = my;
+            bcost = cost;
+            p_fref = p_fref2;
+        }
+    }
+
+    /* Don't need to test mv_range each time, we won't go outside picture+padding */
+    /* diamond */
+    for( i_iter = 0; i_iter < 16; i_iter++ )
+    {
+        int best = 0;
+        int cost[4];
+
+#define COST_MV( c, dx, dy ) \
+        (c) = h->pixf.sad[i_pixel]( m->p_fenc, m->i_stride,                    \
+                               &p_fref[(dy)*m->i_stride+(dx)], m->i_stride ) + \
+              m->lm * ( bs_size_se(((bmx+(dx))<<2) - m->mvp[0] ) +         \
+                        bs_size_se(((bmy+(dy))<<2) - m->mvp[1] ) )
+
+        COST_MV( cost[0],  0, -1 );
+        COST_MV( cost[1],  0,  1 );
+        COST_MV( cost[2], -1,  0 );
+        COST_MV( cost[3],  1,  0 );
+#undef COST_MV
+
+        if( cost[1] < cost[0] )    best = 1;
+        if( cost[2] < cost[best] ) best = 2;
+        if( cost[3] < cost[best] ) best = 3;
+
+        if( bcost <= cost[best] )   /* no neighbour is strictly better: stop */
+            break;
+
+        bcost = cost[best];
+
+        if( best == 0 ) {
+            bmy--;
+            p_fref -= m->i_stride;
+        } else if( best == 1 ) {
+            bmy++;
+            p_fref += m->i_stride;
+        } else if( best == 2 ) {
+            bmx--;
+            p_fref--;
+        } else if( best == 3 ) {
+            bmx++;
+            p_fref++;
+        }
+    }
+
+    /* -> qpel mv */
+    m->mv[0] = bmx << 2;
+    m->mv[1] = bmy << 2;
+
+    /* compute the real cost: satd (the search above used sad) plus the mv cost */
+    m->cost = h->pixf.satd[i_pixel]( m->p_fenc, m->i_stride, p_fref, m->i_stride ) +
+                m->lm * ( bs_size_se( m->mv[0] - m->mvp[0] ) +
+                          bs_size_se( m->mv[1] - m->mvp[1] ) );
+}
+
+/* Refine m->mv (quarter-pel units) with two cross-shaped steps, +/-2 then +/-1, scored by satd
+ * plus mv cost. m->cost must hold the cost of m->mv on entry; it is lowered when a step wins. */
+void x264_me_refine_qpel( x264_t *h, x264_me_t *m )
+{
+    const int bw = x264_pixel_size[m->i_pixel].w;
+    const int bh = x264_pixel_size[m->i_pixel].h;
+    DECLARE_ALIGNED( uint8_t, pix[4][16*16], 16 );
+    int cost[4];
+    int best;
+    int bmx = m->mv[0];
+    int bmy = m->mv[1];
+    /* step 1: build the four neighbours at a distance of 2 (up, down, left, right), stride 16 */
+    h->mc[MC_LUMA]( m->p_fref, m->i_stride, pix[0], 16, bmx + 0, bmy - 2, bw, bh );
+    h->mc[MC_LUMA]( m->p_fref, m->i_stride, pix[1], 16, bmx + 0, bmy + 2, bw, bh );
+    h->mc[MC_LUMA]( m->p_fref, m->i_stride, pix[2], 16, bmx - 2, bmy + 0, bw, bh );
+    h->mc[MC_LUMA]( m->p_fref, m->i_stride, pix[3], 16, bmx + 2, bmy + 0, bw, bh );
+
+    cost[0] = h->pixf.satd[m->i_pixel]( m->p_fenc, m->i_stride, pix[0], 16 ) +
+              m->lm * ( bs_size_se( bmx + 0 - m->mvp[0] ) + bs_size_se( bmy - 2 - m->mvp[1] ) );
+    cost[1] = h->pixf.satd[m->i_pixel]( m->p_fenc, m->i_stride, pix[1], 16 ) +
+              m->lm * ( bs_size_se( bmx + 0 - m->mvp[0] ) + bs_size_se( bmy + 2 - m->mvp[1] ) );
+    cost[2] = h->pixf.satd[m->i_pixel]( m->p_fenc, m->i_stride, pix[2], 16 ) +
+              m->lm * ( bs_size_se( bmx - 2 - m->mvp[0] ) + bs_size_se( bmy + 0 - m->mvp[1] ) );
+    cost[3] = h->pixf.satd[m->i_pixel]( m->p_fenc, m->i_stride, pix[3], 16 ) +
+              m->lm * ( bs_size_se( bmx + 2 - m->mvp[0] ) + bs_size_se( bmy + 0 - m->mvp[1] ) );
+
+    best = 0;
+    if( cost[1] < cost[0] )    best = 1;
+    if( cost[2] < cost[best] ) best = 2;
+    if( cost[3] < cost[best] ) best = 3;
+
+    if( cost[best] < m->cost )
+    {
+        m->cost = cost[best];
+        if( best == 0 )      bmy -= 2;
+        else if( best == 1 ) bmy += 2;
+        else if( best == 2 ) bmx -= 2;
+        else if( best == 3 ) bmx += 2;
+    }
+    /* step 2: same test at a distance of 1 around the position kept by step 1 */
+    h->mc[MC_LUMA]( m->p_fref, m->i_stride, pix[0], 16, bmx + 0, bmy - 1, bw, bh );
+    h->mc[MC_LUMA]( m->p_fref, m->i_stride, pix[1], 16, bmx + 0, bmy + 1, bw, bh );
+    h->mc[MC_LUMA]( m->p_fref, m->i_stride, pix[2], 16, bmx - 1, bmy + 0, bw, bh );
+    h->mc[MC_LUMA]( m->p_fref, m->i_stride, pix[3], 16, bmx + 1, bmy + 0, bw, bh );
+
+    cost[0] = h->pixf.satd[m->i_pixel]( m->p_fenc, m->i_stride, pix[0], 16 ) +
+              m->lm * ( bs_size_se( bmx + 0 - m->mvp[0] ) + bs_size_se( bmy - 1 - m->mvp[1] ) );
+    cost[1] = h->pixf.satd[m->i_pixel]( m->p_fenc, m->i_stride, pix[1], 16 ) +
+              m->lm * ( bs_size_se( bmx + 0 - m->mvp[0] ) + bs_size_se( bmy + 1 - m->mvp[1] ) );
+    cost[2] = h->pixf.satd[m->i_pixel]( m->p_fenc, m->i_stride, pix[2], 16 ) +
+              m->lm * ( bs_size_se( bmx - 1 - m->mvp[0] ) + bs_size_se( bmy + 0 - m->mvp[1] ) );
+    cost[3] = h->pixf.satd[m->i_pixel]( m->p_fenc, m->i_stride, pix[3], 16 ) +
+              m->lm * ( bs_size_se( bmx + 1 - m->mvp[0] ) + bs_size_se( bmy + 0 - m->mvp[1] ) );
+
+    best = 0;
+    if( cost[1] < cost[0] )    best = 1;
+    if( cost[2] < cost[best] ) best = 2;
+    if( cost[3] < cost[best] ) best = 3;
+
+    if( cost[best] < m->cost )
+    {
+        m->cost = cost[best];
+        if( best == 0 )      bmy--;
+        else if( best == 1 ) bmy++;
+        else if( best == 2 ) bmx--;
+        else if( best == 3 ) bmx++;
+    }
+
+    m->mv[0] = bmx;
+    m->mv[1] = bmy;
+}
diff --git a/encoder/me.h b/encoder/me.h
new file mode 100644
index 00000000..bc639a3e
--- /dev/null
+++ b/encoder/me.h
@@ -0,0 +1,52 @@
+/*****************************************************************************
+ * me.h: h264 encoder library (Motion Estimation)
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: me.h,v 1.1 2004/06/03 19:27:08 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#ifndef _ME_H
+#define _ME_H 1
+
+typedef struct
+{
+    /* input */
+    int      i_pixel;   /* PIXEL_WxH */
+    int      lm;        /* lambda motion */
+
+    uint8_t *p_fref;    /* reference pixels; indexed as p_fref[y * i_stride + x] for a full-pel (x,y) */
+    uint8_t *p_fenc;    /* block the candidates are compared against */
+    int      i_stride;  /* stride used for both p_fenc and p_fref */
+
+    int i_mv_range;     /* x264_me_search clips its full-pel start positions to +/- this value */
+
+    int mvp[2];         /* predicted mv (quarter-pel); mv costs are measured relative to it */
+
+    int b_mvc;          /* non-zero if mvc holds an extra start candidate */
+    int mvc[2];         /* candidate mv (quarter-pel), only read when b_mvc is set */
+
+    /* output */
+    int cost;           /* satd + lm * nbits */
+    int mv[2];          /* best mv found (quarter-pel) */
+} x264_me_t;
+
+void x264_me_search( x264_t *h, x264_me_t *m );       /* full-pel search; fills m->mv and m->cost */
+void x264_me_refine_qpel( x264_t *h, x264_me_t *m );  /* sub-pel refinement of m->mv and m->cost */
+
+#endif
diff --git a/encoder/ratecontrol.c b/encoder/ratecontrol.c
new file mode 100644
index 00000000..56a5af9b
--- /dev/null
+++ b/encoder/ratecontrol.c
@@ -0,0 +1,96 @@
+/*****************************************************************************
+ * ratecontrol.c: h264 encoder library (Rate Control)
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: ratecontrol.c,v 1.1 2004/06/03 19:27:08 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdint.h>
+
+#include "../core/common.h"
+#include "ratecontrol.h"
+
+
+/* Allocate a rate-control context initialised from param; returns NULL if allocation fails. */
+x264_ratecontrol_t *x264_ratecontrol_new( x264_param_t *param )
+{
+    x264_ratecontrol_t *rc = x264_malloc( sizeof( *rc ) );
+    if( rc == NULL )
+        return NULL;
+
+    rc->fps       = param->f_fps > 0.1 ? param->f_fps : 25.0f;  /* fall back to 25 fps */
+    rc->i_iframe  = param->i_iframe;
+    rc->i_bitrate = param->i_bitrate * 1000;
+    rc->i_qp_last = 26;
+    rc->i_qp      = param->i_qp_constant;
+    rc->i_frames  = 0;
+    rc->i_size    = 0;
+    return rc;
+}
+
+void x264_ratecontrol_delete( x264_ratecontrol_t *rc )
+{
+    x264_free( rc );    /* counterpart of the x264_malloc done in x264_ratecontrol_new */
+}
+
+void x264_ratecontrol_start( x264_ratecontrol_t *rc, int i_slice_type )
+{
+    rc->i_slice_type = i_slice_type;    /* only read by the disabled code in x264_ratecontrol_end */
+}
+
+int  x264_ratecontrol_qp( x264_ratecontrol_t *rc )
+{
+    return x264_clip3( rc->i_qp, 1, 51 );   /* current QP, passed through x264_clip3 with bounds 1 and 51 */
+}
+
+void x264_ratecontrol_end( x264_ratecontrol_t *rc, int bits )
+{
+    return;     /* currently a no-op: the QP update below is compiled out, so rc and bits are unused */
+#if 0
+    int i_avg;
+    int i_target = rc->i_bitrate / rc->fps;
+    int i_qp = rc->i_qp;
+
+    rc->i_qp_last = rc->i_qp;
+    rc->i_frames++;
+    rc->i_size += bits / 8;
+
+    i_avg = 8 * rc->i_size / rc->i_frames;
+
+    if( rc->i_slice_type == SLICE_TYPE_I )
+    {
+        i_target = i_target * 20 / 10;
+    }
+
+    if( i_avg > i_target * 11 / 10 )
+    {
+        i_qp = rc->i_qp + ( i_avg / i_target - 1 );
+    }
+    else if( i_avg < i_target * 9 / 10 )
+    {
+        i_qp = rc->i_qp - ( i_target / i_avg - 1 );
+    }
+
+    rc->i_qp = x264_clip3( i_qp, rc->i_qp_last - 2, rc->i_qp_last + 2 );
+#endif
+}
+
diff --git a/encoder/ratecontrol.h b/encoder/ratecontrol.h
new file mode 100644
index 00000000..5fa3c7c9
--- /dev/null
+++ b/encoder/ratecontrol.h
@@ -0,0 +1,52 @@
+/*****************************************************************************
+ * ratecontrol.h: h264 encoder library (Rate Control)
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: ratecontrol.h,v 1.1 2004/06/03 19:27:08 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#ifndef _RATECONTROL_H
+#define _RATECONTROL_H 1
+
+struct x264_ratecontrol_t
+{
+    float fps;          /* param->f_fps, or 25 when that is not above 0.1 */
+    int   i_iframe;     /* copy of param->i_iframe */
+
+    int i_bitrate;      /* param->i_bitrate * 1000 */
+    int i_qp_last;      /* QP before the latest update; starts at 26 */
+    int i_qp;           /* current QP; starts at param->i_qp_constant */
+
+    int i_slice_type;   /* slice type given to x264_ratecontrol_start() */
+
+    int     i_frames;   /* frame counter; zeroed at creation, advanced only by the disabled update code */
+    int64_t i_size;     /* accumulated size in bytes; same remark as i_frames */
+
+};
+
+
+x264_ratecontrol_t *x264_ratecontrol_new   ( x264_param_t * );         /* create a context from the encoder parameters */
+void                x264_ratecontrol_delete( x264_ratecontrol_t * );   /* free a context */
+
+void x264_ratecontrol_start( x264_ratecontrol_t *, int i_slice_type ); /* record the slice type about to be coded */
+int  x264_ratecontrol_qp( x264_ratecontrol_t * );                      /* QP to use */
+void x264_ratecontrol_end( x264_ratecontrol_t *, int bits );           /* report the bits produced (currently a no-op) */
+
+#endif
+
diff --git a/encoder/set.c b/encoder/set.c
new file mode 100644
index 00000000..56284642
--- /dev/null
+++ b/encoder/set.c
@@ -0,0 +1,382 @@
+/*****************************************************************************
+ * set: h264 encoder (SPS and PPS init and write)
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: set.c,v 1.1 2004/06/03 19:27:08 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdint.h>
+
+#include "../x264.h"
+#include "../core/bs.h"
+#include "../core/set.h"
+
+/*
+ * Fill *sps from the encoder parameters: profile, frame_num/POC field sizes,
+ * reference count, picture size in macroblocks, cropping and optional SAR.
+ * i_id is stored as the SPS id.  When a sample aspect ratio is configured, a
+ * diagnostic is printed on stderr and b_vui is set only if the reduced ratio
+ * is non-zero and differs from 1:1.
+ */
+void x264_sps_init( x264_sps_t *sps, int i_id, x264_param_t *param )
+{
+    sps->i_id               = i_id;
+
+    if( param->b_cabac || param->i_bframe > 0 )
+        sps->i_profile_idc      = PROFILE_MAIN;
+    else
+        sps->i_profile_idc      = PROFILE_BASELINE;
+
+    sps->i_level_idc        = 21;               /* FIXME ? */
+    sps->b_constraint_set0  = 0;
+    sps->b_constraint_set1  = 0;
+    sps->b_constraint_set2  = 0;
+
+    sps->i_log2_max_frame_num = 4;  /* at least 4 */
+    while( (1 << sps->i_log2_max_frame_num) <= param->i_idrframe * param->i_iframe )
+    {
+        sps->i_log2_max_frame_num++;
+    }
+    sps->i_log2_max_frame_num++;    /* just in case */
+
+    sps->i_poc_type = 0;
+    if( sps->i_poc_type == 0 )
+    {
+        sps->i_log2_max_poc_lsb = sps->i_log2_max_frame_num + 1;    /* max poc = 2*frame_num */
+    }
+    else if( sps->i_poc_type == 1 )
+    {
+        int i;
+
+        /* FIXME */
+        sps->b_delta_pic_order_always_zero = 1;
+        sps->i_offset_for_non_ref_pic = 0;
+        sps->i_offset_for_top_to_bottom_field = 0;
+        sps->i_num_ref_frames_in_poc_cycle = 0;
+
+        for( i = 0; i < sps->i_num_ref_frames_in_poc_cycle; i++ )
+        {
+            sps->i_offset_for_ref_frame[i] = 0;
+        }
+    }
+
+    sps->i_num_ref_frames = param->i_frame_reference + 1; /* +1 for 2 ref in B */
+    sps->b_gaps_in_frame_num_value_allowed = 0;
+    sps->i_mb_width = ( param->i_width + 15 ) / 16;
+    sps->i_mb_height= ( param->i_height + 15 )/ 16;
+    sps->b_frame_mbs_only = 1;
+    sps->b_mb_adaptive_frame_field = 0;
+    sps->b_direct8x8_inference = 0;
+    if( sps->b_frame_mbs_only == 0 )
+    {
+        sps->b_direct8x8_inference = 1;
+    }
+
+    /* Padding added to reach a multiple of 16, halved as in the 2-sample crop
+     * units of 4:2:0 frame coding (H.264 7.4.2.1).  The "% 16" keeps a
+     * dimension that is already aligned at 0 when only the other one needs
+     * cropping; without it the result is 8, i.e. 16 valid pixels cut off. */
+    sps->crop.i_left    = 0;
+    sps->crop.i_right   = ( 16 - param->i_width % 16 ) % 16 / 2;
+    sps->crop.i_top     = 0;
+    sps->crop.i_bottom  = ( 16 - param->i_height % 16 ) % 16 / 2;
+    sps->b_crop = ( param->i_width % 16 != 0 || param->i_height % 16 != 0 );
+
+    if( param->vui.i_sar_width > 0 && param->vui.i_sar_height > 0 )
+    {
+        int w = param->vui.i_sar_width;
+        int h = param->vui.i_sar_height;
+        int a = w, b = h;
+
+        while( b != 0 )
+        {
+            int t = a;
+
+            a = b;
+            b = t % b;
+        }
+
+        w /= a;
+        h /= a;
+        while( w > 65535 || h > 65535 )
+        {
+            w /= 2;
+            h /= 2;
+        }
+
+        if( w == 0 || h == 0 )
+        {
+            fprintf( stderr, "x264: cannot create valid sample aspect ratio\n" );
+            sps->b_vui = 0;
+        }
+        else if( w == h )
+        {
+            fprintf( stderr, "x264: no need for a SAR\n" );
+            sps->b_vui = 0;
+        }
+        else
+        {
+            fprintf( stderr, "x264: using SAR=%d/%d\n", w, h );
+            sps->b_vui = 1;
+            sps->vui.i_sar_width = w;
+            sps->vui.i_sar_height= h;
+        }
+    }
+    else
+    {
+        sps->b_vui = 0;
+    }
+}
+
+
+/* Write the sequence parameter set *sps to bitstream s, ending with
+ * bs_rbsp_trailing().  Fields are emitted strictly in the order below. */
+void x264_sps_write( bs_t *s, x264_sps_t *sps )
+{
+    bs_write( s, 8, sps->i_profile_idc );
+    bs_write( s, 1, sps->b_constraint_set0 );
+    bs_write( s, 1, sps->b_constraint_set1 );
+    bs_write( s, 1, sps->b_constraint_set2 );
+    bs_write( s, 5, 0 );    /* reserved zero bits */
+    bs_write( s, 8, sps->i_level_idc );
+    bs_write_ue( s, sps->i_id );
+    bs_write_ue( s, sps->i_log2_max_frame_num - 4 );
+    bs_write_ue( s, sps->i_poc_type );
+    switch( sps->i_poc_type )
+    {
+        case 0:
+            bs_write_ue( s, sps->i_log2_max_poc_lsb - 4 );
+            break;
+        case 1:
+        {
+            int k;
+
+            bs_write( s, 1, sps->b_delta_pic_order_always_zero );
+            bs_write_se( s, sps->i_offset_for_non_ref_pic );
+            bs_write_se( s, sps->i_offset_for_top_to_bottom_field );
+            bs_write_ue( s, sps->i_num_ref_frames_in_poc_cycle );
+            for( k = 0; k < sps->i_num_ref_frames_in_poc_cycle; k++ )
+            {
+                bs_write_se( s, sps->i_offset_for_ref_frame[k] );
+            }
+            break;
+        }
+        default:    /* nothing further is written for other types */
+            break;
+    }
+
+    bs_write_ue( s, sps->i_num_ref_frames );
+    bs_write( s, 1, sps->b_gaps_in_frame_num_value_allowed );
+    bs_write_ue( s, sps->i_mb_width - 1 );
+    bs_write_ue( s, sps->i_mb_height - 1);
+    bs_write( s, 1, sps->b_frame_mbs_only );
+    if( sps->b_frame_mbs_only == 0 )
+    {
+        bs_write( s, 1, sps->b_mb_adaptive_frame_field );
+    }
+    bs_write( s, 1, sps->b_direct8x8_inference );
+
+    bs_write( s, 1, sps->b_crop );
+    if( sps->b_crop )
+    {
+        bs_write_ue( s, sps->crop.i_left );
+        bs_write_ue( s, sps->crop.i_right );
+        bs_write_ue( s, sps->crop.i_top );
+        bs_write_ue( s, sps->crop.i_bottom );
+    }
+
+    bs_write( s, 1, sps->b_vui );
+    if( sps->b_vui )
+    {
+        /* ratios that have a predefined code; { 0, 0, -1 } ends the list */
+        static const struct { int w, h; int sar; } sar[] =
+        {
+            { 1,   1, 1 }, { 12, 11, 2 }, { 10, 11, 3 }, { 16, 11, 4 },
+            { 40, 33, 5 }, { 24, 11, 6 }, { 20, 11, 7 }, { 32, 11, 8 },
+            { 80, 33, 9 }, { 18, 11, 10}, { 15, 11, 11}, { 64, 33, 12},
+            { 160,99, 13}, { 0, 0, -1 }
+        };
+        int n = 0;
+
+        bs_write1( s, 1 );      /* aspect_ratio_info_present_flag */
+        while( sar[n].sar != -1 &&
+               ( sar[n].w != sps->vui.i_sar_width || sar[n].h != sps->vui.i_sar_height ) )
+        {
+            n++;
+        }
+        if( sar[n].sar == -1 )
+        {
+            bs_write( s, 8, 255 );  /* aspect_ratio_idc (extended): explicit ratio follows */
+            bs_write( s, 16, sps->vui.i_sar_width );
+            bs_write( s, 16, sps->vui.i_sar_height );
+        }
+        else
+        {
+            bs_write( s, 8, sar[n].sar );   /* aspect_ratio_idc taken from the table */
+        }
+
+        bs_write1( s, 0 );      /* overscan_info_present_flag */
+        bs_write1( s, 0 );      /* video_signal_type_present_flag */
+        bs_write1( s, 0 );      /* chroma_loc_info_present_flag */
+        bs_write1( s, 0 );      /* timing_info_present_flag */
+        bs_write1( s, 0 );      /* nal_hrd_parameters_present_flag */
+        bs_write1( s, 0 );      /* vcl_hrd_parameters_present_flag */
+        bs_write1( s, 0 );      /* pic_struct_present_flag */
+        bs_write1( s, 0 );      /* bitstream_restriction_flag */
+    }
+
+    bs_rbsp_trailing( s );
+}
+
+/*
+ * Initialise *pps with id i_id, referring to *sps: one slice group, one
+ * active reference per list, no weighted prediction, initial QP/QS of 26.
+ * Only b_cabac is read from param.
+ */
+void x264_pps_init( x264_pps_t *pps, int i_id, x264_param_t *param, x264_sps_t *sps )
+{
+    pps->i_id     = i_id;
+    pps->i_sps_id = sps->i_id;
+    pps->b_cabac  = param->b_cabac;
+
+    pps->b_pic_order        = 0;
+    pps->i_num_slice_groups = 1;
+
+    /* slice group defaults; skipped while a single group is configured */
+    if( pps->i_num_slice_groups > 1 )
+    {
+        int n;
+
+        pps->i_slice_group_map_type = 0;
+        switch( pps->i_slice_group_map_type )
+        {
+            case 0:
+                for( n = 0; n < pps->i_num_slice_groups; n++ )
+                    pps->i_run_length[n] = 1;
+                break;
+
+            case 2:
+                for( n = 0; n < pps->i_num_slice_groups; n++ )
+                {
+                    pps->i_top_left[n]     = 0;
+                    pps->i_bottom_right[n] = 0;
+                }
+                break;
+
+            case 3:
+            case 4:
+            case 5:
+                pps->b_slice_group_change_direction = 0;
+                pps->i_slice_group_change_rate      = 0;
+                break;
+
+            case 6:
+                pps->i_pic_size_in_map_units = 1;
+                for( n = 0; n < pps->i_pic_size_in_map_units; n++ )
+                    pps->i_slice_group_id[n] = 0;
+                break;
+            default:
+                break;
+        }
+    }
+
+    pps->i_num_ref_idx_l0_active = 1;
+    pps->i_num_ref_idx_l1_active = 1;
+
+    pps->b_weighted_pred   = 0;
+    pps->b_weighted_bipred = 0;
+
+    pps->i_pic_init_qp = 26;
+    pps->i_pic_init_qs = 26;
+
+    pps->i_chroma_qp_index_offset = 0;
+
+    /* always set, independently of any deblocking setting in param */
+    pps->b_deblocking_filter_control = 1;
+    pps->b_constrained_intra_pred    = 0;
+    pps->b_redundant_pic_cnt         = 0;
+}
+
+/* Write the picture parameter set *pps to bitstream s, ending with
+ * bs_rbsp_trailing().  Slice group details are only emitted when more than
+ * one slice group is configured. */
+void x264_pps_write( bs_t *s, x264_pps_t *pps )
+{
+    bs_write_ue( s, pps->i_id );
+    bs_write_ue( s, pps->i_sps_id );
+
+    bs_write( s, 1, pps->b_cabac );
+    bs_write( s, 1, pps->b_pic_order );
+    bs_write_ue( s, pps->i_num_slice_groups - 1 );
+
+    if( pps->i_num_slice_groups > 1 )
+    {
+        int i;
+
+        bs_write_ue( s, pps->i_slice_group_map_type );
+        if( pps->i_slice_group_map_type == 0 )
+        {
+            for( i = 0; i < pps->i_num_slice_groups; i++ )
+            {
+                bs_write_ue( s, pps->i_run_length[i] - 1 );
+            }
+        }
+        else if( pps->i_slice_group_map_type == 2 )
+        {
+            /* the syntax carries a rectangle for every group except the last */
+            for( i = 0; i < pps->i_num_slice_groups - 1; i++ )
+            {
+                bs_write_ue( s, pps->i_top_left[i] );
+                bs_write_ue( s, pps->i_bottom_right[i] );
+            }
+        }
+        else if( pps->i_slice_group_map_type >= 3 &&
+                 pps->i_slice_group_map_type <= 5 )
+        {
+            bs_write( s, 1, pps->b_slice_group_change_direction );
+            /* slice_group_change_rate_minus1: the rate, not the direction flag */
+            bs_write_ue( s, pps->i_slice_group_change_rate - 1 );
+        }
+        else if( pps->i_slice_group_map_type == 6 )
+        {
+            bs_write_ue( s, pps->i_pic_size_in_map_units - 1 );
+            /* FIXME: slice_group_id[] is not written yet (see the H.264 PPS
+             * syntax for its bit width), so map type 6 output is incomplete. */
+        }
+    }
+
+    bs_write_ue( s, pps->i_num_ref_idx_l0_active - 1 );
+    bs_write_ue( s, pps->i_num_ref_idx_l1_active - 1 );
+    bs_write( s, 1, pps->b_weighted_pred );
+    bs_write( s, 2, pps->b_weighted_bipred );   /* written as a two-bit field */
+
+    bs_write_se( s, pps->i_pic_init_qp - 26 );
+    bs_write_se( s, pps->i_pic_init_qs - 26 );
+    bs_write_se( s, pps->i_chroma_qp_index_offset );
+
+    bs_write( s, 1, pps->b_deblocking_filter_control );
+    bs_write( s, 1, pps->b_constrained_intra_pred );
+    bs_write( s, 1, pps->b_redundant_pic_cnt );
+
+    bs_rbsp_trailing( s );
+}
+
diff --git a/encoder/set.h b/encoder/set.h
new file mode 100644
index 00000000..44cae088
--- /dev/null
+++ b/encoder/set.h
@@ -0,0 +1,32 @@
+/*****************************************************************************
+ * set.h: h264 encoder
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: set.h,v 1.1 2004/06/03 19:27:08 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#ifndef _ENCODER_SET_H
+#define _ENCODER_SET_H 1
+
+void x264_sps_init( x264_sps_t *sps, int i_id, x264_param_t *param );   /* fill an SPS from the encoder parameters */
+void x264_sps_write( bs_t *s, x264_sps_t *sps );                        /* serialise an SPS into a bitstream */
+void x264_pps_init( x264_pps_t *pps, int i_id, x264_param_t *param, x264_sps_t *sps );  /* fill a PPS that refers to sps */
+void x264_pps_write( bs_t *s, x264_pps_t *pps );                        /* serialise a PPS into a bitstream */
+
+#endif
diff --git a/extras/getopt.c b/extras/getopt.c
new file mode 100644
index 00000000..3b5e196b
--- /dev/null
+++ b/extras/getopt.c
@@ -0,0 +1,503 @@
+/*	$NetBSD: getopt_long.c,v 1.15 2002/01/31 22:43:40 tv Exp $	*/
+
+/*-
+ * Copyright (c) 2000 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Dieter Baron and Thomas Klausner.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *        This product includes software developed by the NetBSD
+ *        Foundation, Inc. and its contributors.
+ * 4. Neither the name of The NetBSD Foundation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <assert.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <string.h>
+#include <getopt.h>
+#include <stdarg.h>
+#include <stdio.h>
+
+#define REPLACE_GETOPT
+/* Diagnostic assertions are compiled out: _DIAGASSERT() expands to an empty statement. */
+#define _DIAGASSERT(x) do {} while (0)
+
+#ifdef REPLACE_GETOPT
+#ifdef __weak_alias
+__weak_alias(getopt,_getopt)
+#endif
+int opterr = 1;	/* if error message should be printed */
+int optind = 1;	/* index into parent argv vector */
+int optopt = '?';	/* character checked for validity */
+int optreset;	/* reset getopt */
+char *optarg;	/* argument associated with option */
+#endif
+
+#ifdef __weak_alias
+__weak_alias(getopt_long,_getopt_long)
+#endif
+
+#ifndef __CYGWIN__
+#define __progname __argv[0]	/* NOTE(review): relies on a toolchain-provided __argv; confirm per platform */
+#else
+extern char *__progname;
+#endif
+
+#define IGNORE_FIRST	(*options == '-' || *options == '+')
+#define PRINT_ERROR	((opterr) && ((*options != ':') \
+				      || (IGNORE_FIRST && options[1] != ':')))
+/* POSIX ordering is the default here; set POSIXLY_INCORRECT_GETOPT to allow permutation. */
+#define IS_POSIXLY_CORRECT (getenv("POSIXLY_INCORRECT_GETOPT") == NULL)
+
+#define PERMUTE         (!IS_POSIXLY_CORRECT && !IGNORE_FIRST)
+/* XXX: GNU ignores PC if *options == '-' */
+#define IN_ORDER        (!IS_POSIXLY_CORRECT && *options == '-')
+
+/* return values */
+#define	BADCH	(int)'?'
+#define	BADARG		((IGNORE_FIRST && options[1] == ':') \
+			 || (*options == ':') ? (int)':' : (int)'?')
+#define INORDER (int)1
+
+static char EMSG[1];	/* empty string: zero-initialised, so *EMSG == '\0' */
+
+static int getopt_internal (int, char * const *, const char *);
+static int gcd (int, int);
+static void permute_args (int, int, int, char * const *);
+
+static char *place = EMSG; /* option letter processing */
+
+/* XXX: set optreset to 1 rather than these two */
+static int nonopt_start = -1; /* first non option argument (for permute) */
+static int nonopt_end = -1;   /* first option after non options (for permute) */
+
+/* Error messages */
+static const char recargchar[] = "option requires an argument -- %c";
+static const char recargstring[] = "option requires an argument -- %s";
+static const char ambig[] = "ambiguous option -- %.*s";
+static const char noarg[] = "option doesn't take an argument -- %.*s";
+static const char illoptchar[] = "unknown option -- %c";
+static const char illoptstring[] = "unknown option -- %s";
+
+static void
+_vwarnx(const char *fmt, va_list ap)	/* print "<progname>: <message>" and a newline on stderr */
+{
+  (void)fprintf(stderr, "%s: ", __progname);
+  if (fmt != NULL)	/* with a NULL format only the prefix and the newline are printed */
+    (void)vfprintf(stderr, fmt, ap);
+  (void)fprintf(stderr, "\n");
+}
+
+static void
+warnx(const char *fmt, ...)	/* printf-style front end to _vwarnx() */
+{
+  va_list ap;
+  va_start(ap, fmt);
+  _vwarnx(fmt, ap);	/* does the actual formatting and output */
+  va_end(ap);
+}
+
+/*
+ * Compute the greatest common divisor of a and b.
+ */
+static int
+gcd(int a, int b)	/* Euclid; a zero operand yields the other operand */
+{
+	int c;
+
+	/*
+	 * Test b before dividing so that gcd(a, 0) returns a instead of
+	 * evaluating a % 0, which is undefined behaviour.
+	 */
+	while (b != 0) {
+		c = a % b;
+		a = b;
+		b = c;
+	}
+	return a;
+}
+
+/*
+ * Exchange the block from nonopt_start to nonopt_end with the block
+ * from nonopt_end to opt_end (keeping the same order of arguments
+ * in each block).
+ */
+static void
+permute_args(int panonopt_start, int panonopt_end, int opt_end,
+    char * const *nargv)
+{
+	/* LINTED const cast: the vector is reordered in place */
+	char **argv = (char **)nargv;
+	int n_nonopt, n_opt, n_cycles, cycle_len;
+	int cycle, step, first, cur;
+	char *held;
+
+	_DIAGASSERT(nargv != NULL);
+
+	/*
+	 * Block sizes, then the number of independent rotation cycles
+	 * (their gcd) and how many slots each cycle visits.
+	 */
+	n_nonopt = panonopt_end - panonopt_start;
+	n_opt = opt_end - panonopt_end;
+	n_cycles = gcd(n_nonopt, n_opt);
+	cycle_len = (opt_end - panonopt_start) / n_cycles;
+
+	for (cycle = 0; cycle < n_cycles; cycle++) {
+		first = panonopt_end + cycle;
+		cur = first;
+		for (step = 0; step < cycle_len; step++) {
+			/* destination of the element now held in argv[first] */
+			if (cur < panonopt_end)
+				cur += n_opt;
+			else
+				cur -= n_nonopt;
+			held = argv[cur];
+			argv[cur] = argv[first];
+			argv[first] = held;
+		}
+	}
+}
+
+/*
+ * getopt_internal --
+ *	Parse one short option; shared by getopt() and getopt_long().  Returns
+ *	the letter, BADCH/BADARG, INORDER, -1 at the end, or -2 for "--"/"-W".
+ */
+static int
+getopt_internal(nargc, nargv, options)
+	int nargc;
+	char * const *nargv;
+	const char *options;
+{
+	char *oli;				/* option letter list index */
+	int optchar;
+
+	_DIAGASSERT(nargv != NULL);
+	_DIAGASSERT(options != NULL);
+
+	optarg = NULL;
+
+	/*
+	 * XXX Some programs (like rsyncd) expect to be able to
+	 * XXX re-initialize optind to 0 and have getopt_long(3)
+	 * XXX properly function again.  Work around this braindamage.
+	 */
+	if (optind == 0)
+		optind = 1;
+
+	if (optreset)
+		nonopt_start = nonopt_end = -1;
+start:
+	if (optreset || !*place) {		/* update scanning pointer */
+		optreset = 0;
+		if (optind >= nargc) {          /* end of argument vector */
+			place = EMSG;
+			if (nonopt_end != -1) {
+				/* do permutation, if we have to */
+				permute_args(nonopt_start, nonopt_end,
+				    optind, nargv);
+				optind -= nonopt_end - nonopt_start;
+			}
+			else if (nonopt_start != -1) {
+				/*
+				 * If we skipped non-options, set optind
+				 * to the first of them.
+				 */
+				optind = nonopt_start;
+			}
+			nonopt_start = nonopt_end = -1;
+			return -1;
+		}
+		if ((*(place = nargv[optind]) != '-')
+		    || (place[1] == '\0')) {    /* found non-option */
+			place = EMSG;
+			if (IN_ORDER) {
+				/*
+				 * GNU extension:
+				 * return non-option as argument to option 1
+				 */
+				optarg = nargv[optind++];
+				return INORDER;
+			}
+			if (!PERMUTE) {
+				/*
+				 * if no permutation wanted, stop parsing
+				 * at first non-option
+				 */
+				return -1;
+			}
+			/* do permutation */
+			if (nonopt_start == -1)
+				nonopt_start = optind;
+			else if (nonopt_end != -1) {
+				permute_args(nonopt_start, nonopt_end,
+				    optind, nargv);
+				nonopt_start = optind -
+				    (nonopt_end - nonopt_start);
+				nonopt_end = -1;
+			}
+			optind++;
+			/* process next argument */
+			goto start;
+		}
+		if (nonopt_start != -1 && nonopt_end == -1)
+			nonopt_end = optind;
+		if (place[1] && *++place == '-') {	/* found "--" */
+			place++;
+			return -2;	/* caller tells a bare "--" from a long option */
+		}
+	}
+	if ((optchar = (int)*place++) == (int)':' ||
+	    (oli = strchr(options + (IGNORE_FIRST ? 1 : 0), optchar)) == NULL) {
+		/* option letter unknown or ':' */
+		if (!*place)	/* last letter of this argv element: move on */
+			++optind;
+		if (PRINT_ERROR)
+			warnx(illoptchar, optchar);
+		optopt = optchar;
+		return BADCH;
+	}
+	if (optchar == 'W' && oli[1] == ';') {		/* -W long-option */
+		/* XXX: what if no long options provided (called by getopt)? */
+		if (*place)
+			return -2;
+
+		if (++optind >= nargc) {	/* no arg */
+			place = EMSG;
+			if (PRINT_ERROR)
+				warnx(recargchar, optchar);
+			optopt = optchar;
+			return BADARG;
+		} else				/* white space */
+			place = nargv[optind];
+		/*
+		 * Handle -W arg the same as --arg (which causes getopt to
+		 * stop parsing).
+		 */
+		return -2;
+	}
+	if (*++oli != ':') {			/* doesn't take argument */
+		if (!*place)
+			++optind;
+	} else {				/* takes (optional) argument */
+		optarg = NULL;
+		if (*place)			/* no white space */
+			optarg = place;
+		/* XXX: disable test for :: if PC? (GNU doesn't) */
+		else if (oli[1] != ':') {	/* arg not optional */
+			if (++optind >= nargc) {	/* no arg */
+				place = EMSG;
+				if (PRINT_ERROR)
+					warnx(recargchar, optchar);
+				optopt = optchar;
+				return BADARG;
+			} else
+				optarg = nargv[optind];
+		}
+		place = EMSG;
+		++optind;
+	}
+	/* dump back option letter */
+	return optchar;
+}
+
+#ifdef REPLACE_GETOPT
+/*
+ * getopt --
+ *	Parse argc/argv argument vector.
+ *
+ * [eventually this will replace the real getopt]
+ */
+int
+getopt(int nargc, char * const *nargv, const char *options)
+{
+	int rc;
+
+	_DIAGASSERT(nargv != NULL);
+	_DIAGASSERT(options != NULL);
+
+	rc = getopt_internal(nargc, nargv, options);
+	if (rc != -2)
+		return rc;	/* ordinary option, error code or end of input */
+
+	/*
+	 * -2 means a "--" prefix (or -W) was seen: step over that argument
+	 * and, if non-options were skipped on the way, permute them.
+	 */
+	++optind;
+	if (nonopt_end != -1) {
+		permute_args(nonopt_start, nonopt_end, optind, nargv);
+		optind -= nonopt_end - nonopt_start;
+	}
+	nonopt_start = -1;
+	nonopt_end = -1;
+
+	/* plain getopt() has no long options, so this just ends parsing */
+	return -1;
+}
+#endif
+
+/*
+ * getopt_long --
+ *	Parse argc/argv with short and long options; *idx gets the match index.
+ */
+int
+getopt_long(nargc, nargv, options, long_options, idx)
+	int nargc;
+	char * const *nargv;
+	const char *options;
+	const struct option *long_options;
+	int *idx;
+{
+	int retval;
+
+	_DIAGASSERT(nargv != NULL);
+	_DIAGASSERT(options != NULL);
+	_DIAGASSERT(long_options != NULL);
+	/* idx may be NULL */
+
+	if ((retval = getopt_internal(nargc, nargv, options)) == -2) {
+		char *current_argv, *has_equal;
+		size_t current_argv_len;
+		int i, match;
+
+		current_argv = place;	/* text after "--" (or the -W argument) */
+		match = -1;
+
+		optind++;
+		place = EMSG;
+
+		if (*current_argv == '\0') {		/* found "--" */
+			/*
+			 * We found an option (--), so if we skipped
+			 * non-options, we have to permute.
+			 */
+			if (nonopt_end != -1) {
+				permute_args(nonopt_start, nonopt_end,
+				    optind, nargv);
+				optind -= nonopt_end - nonopt_start;
+			}
+			nonopt_start = nonopt_end = -1;
+			return -1;
+		}
+		if ((has_equal = strchr(current_argv, '=')) != NULL) {
+			/* argument found (--option=arg) */
+			current_argv_len = has_equal - current_argv;
+			has_equal++;
+		} else
+			current_argv_len = strlen(current_argv);
+
+		for (i = 0; long_options[i].name; i++) {
+			/* find matching long option */
+			if (strncmp(current_argv, long_options[i].name,
+			    current_argv_len))
+				continue;
+
+			if (strlen(long_options[i].name) ==
+			    (unsigned)current_argv_len) {
+				/* exact match */
+				match = i;
+				break;
+			}
+			if (match == -1)		/* partial match */
+				match = i;
+			else {
+				/* ambiguous abbreviation */
+				if (PRINT_ERROR)
+					warnx(ambig, (int)current_argv_len,
+					     current_argv);
+				optopt = 0;
+				return BADCH;
+			}
+		}
+		if (match != -1) {			/* option found */
+			if (long_options[match].has_arg == no_argument
+			    && has_equal) {
+				if (PRINT_ERROR)
+					warnx(noarg, (int)current_argv_len,
+					     current_argv);
+				/*
+				 * XXX: GNU sets optopt to val regardless of
+				 * flag
+				 */
+				if (long_options[match].flag == NULL)
+					optopt = long_options[match].val;
+				else
+					optopt = 0;
+				return BADARG;
+			}
+			if (long_options[match].has_arg == required_argument ||
+			    long_options[match].has_arg == optional_argument) {
+				if (has_equal)
+					optarg = has_equal;
+				else if (long_options[match].has_arg ==
+				    required_argument) {
+					/*
+					 * optional argument doesn't use
+					 * next nargv
+					 */
+					optarg = nargv[optind++];	/* NULL if nargv is NULL-terminated and nothing follows */
+				}
+			}
+			if ((long_options[match].has_arg == required_argument)
+			    && (optarg == NULL)) {
+				/*
+				 * Missing argument; leading ':'
+				 * indicates no error should be generated
+				 */
+				if (PRINT_ERROR)
+					warnx(recargstring, current_argv);
+				/*
+				 * XXX: GNU sets optopt to val regardless
+				 * of flag
+				 */
+				if (long_options[match].flag == NULL)
+					optopt = long_options[match].val;
+				else
+					optopt = 0;
+				--optind;	/* undo the optind++ made while fetching the missing argument */
+				return BADARG;
+			}
+		} else {			/* unknown option */
+			if (PRINT_ERROR)
+				warnx(illoptstring, current_argv);
+			optopt = 0;
+			return BADCH;
+		}
+		if (long_options[match].flag) {
+			*long_options[match].flag = long_options[match].val;
+			retval = 0;
+		} else
+			retval = long_options[match].val;
+		if (idx)
+			*idx = match;
+	}
+	return retval;
+}
diff --git a/extras/getopt.h b/extras/getopt.h
new file mode 100644
index 00000000..18e10269
--- /dev/null
+++ b/extras/getopt.h
@@ -0,0 +1,179 @@
+/* Declarations for getopt.
+   Copyright (C) 1989-1994, 1996-1999, 2001 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2, or (at your option)
+   any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software Foundation,
+   Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.  */
+
+#ifndef _GETOPT_H
+
+#ifndef __need_getopt
+# define _GETOPT_H 1
+#endif
+
+/* If __GNU_LIBRARY__ is not already defined, either we are being used
+   standalone, or this is the first header included in the source file.
+   If we are being used with glibc, we need to include <features.h>, but
+   that does not exist if we are standalone.  So: if __GNU_LIBRARY__ is
+   not defined, include <ctype.h>, which will pull in <features.h> for us
+   if it's from glibc.  (Why ctype.h?  It's guaranteed to exist and it
+   doesn't flood the namespace with stuff the way some other headers do.)  */
+#if !defined __GNU_LIBRARY__
+# include <ctype.h>
+#endif
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+/* For communication from `getopt' to the caller.
+   When `getopt' finds an option that takes an argument,
+   the argument value is returned here.
+   Also, when `ordering' is RETURN_IN_ORDER,
+   each non-option ARGV-element is returned here.  */
+
+extern char *optarg;
+
+/* Index in ARGV of the next element to be scanned.
+   This is used for communication to and from the caller
+   and for communication between successive calls to `getopt'.
+
+   On entry to `getopt', zero means this is the first call; initialize.
+
+   When `getopt' returns -1, this is the index of the first of the
+   non-option elements that the caller should itself scan.
+
+   Otherwise, `optind' communicates from one call to the next
+   how much of ARGV has been scanned so far.  */
+
+extern int optind;
+
+/* Callers store zero here to inhibit the error message `getopt' prints
+   for unrecognized options.  */
+
+extern int opterr;
+
+/* Set to an option character which was unrecognized.  */
+
+extern int optopt;
+
+#ifndef __need_getopt
+/* Describe the long-named options requested by the application.
+   The LONG_OPTIONS argument to getopt_long or getopt_long_only is a vector
+   of `struct option' terminated by an element containing a name which is
+   zero.
+
+   The field `has_arg' is:
+   no_argument		(or 0) if the option does not take an argument,
+   required_argument	(or 1) if the option requires an argument,
+   optional_argument 	(or 2) if the option takes an optional argument.
+
+   If the field `flag' is not NULL, it points to a variable that is set
+   to the value given in the field `val' when the option is found, but
+   left unchanged if the option is not found.
+
+   To have a long-named option do something other than set an `int' to
+   a compiled-in constant, such as set a value from `optarg', set the
+   option's `flag' field to zero and its `val' field to a nonzero
+   value (the equivalent single-letter option character, if there is
+   one).  For long options that have a zero `flag' field, `getopt'
+   returns the contents of the `val' field.  */
+
+/* One entry of the long-option table handed to getopt_long and
+   getopt_long_only.  */
+struct option
+{
+  /* Long option name; an entry whose name is zero terminates the table.  */
+# if (defined __STDC__ && __STDC__) || defined __cplusplus
+  const char *name;
+# else
+  char *name;
+# endif
+  /* has_arg can't be an enum because some compilers complain about
+     type mismatches in all the code that assumes it is an int.  */
+  /* One of no_argument, required_argument or optional_argument.  */
+  int has_arg;
+  /* If not NULL, the pointed-to int is set to `val' when the option is
+     found.  */
+  int *flag;
+  /* Value stored through `flag', or returned when `flag' is zero.  */
+  int val;
+};
+
+/* Names for the values of the `has_arg' field of `struct option'.  */
+
+# define no_argument		0
+# define required_argument	1
+# define optional_argument	2
+#endif	/* need getopt */
+
+
+/* Get definitions and prototypes for functions to process the
+   arguments in ARGV (ARGC of them, minus the program name) for
+   options given in OPTS.
+
+   Return the option character from OPTS just read.  Return -1 when
+   there are no more options.  For unrecognized options, or options
+   missing arguments, `optopt' is set to the option letter, and '?' is
+   returned.
+
+   The OPTS string is a list of characters which are recognized option
+   letters, optionally followed by colons, specifying that that letter
+   takes an argument, to be placed in `optarg'.
+
+   If a letter in OPTS is followed by two colons, its argument is
+   optional.  This behavior is specific to the GNU `getopt'.
+
+   The argument `--' causes premature termination of argument
+   scanning, explicitly telling `getopt' that there are no more
+   options.
+
+   If OPTS begins with `-', then non-option arguments are treated as
+   arguments to the option '\1'.  This behavior is specific to the GNU
+   `getopt'.  */
+
+#if (defined __STDC__ && __STDC__) || defined __cplusplus
+# ifdef __GNU_LIBRARY__
+/* Many other libraries have conflicting prototypes for getopt, with
+   differences in the consts, in stdlib.h.  To avoid compilation
+   errors, only prototype getopt for the GNU C library.  */
+extern int getopt (int __argc, char *const *__argv, const char *__shortopts);
+# else /* not __GNU_LIBRARY__ */
+extern int getopt ();
+# endif /* __GNU_LIBRARY__ */
+
+# ifndef __need_getopt
+extern int getopt_long (int __argc, char *const *__argv, const char *__shortopts,
+		        const struct option *__longopts, int *__longind);
+extern int getopt_long_only (int __argc, char *const *__argv,
+			     const char *__shortopts,
+		             const struct option *__longopts, int *__longind);
+
+/* Internal only.  Users should not call this directly.  */
+extern int _getopt_internal (int __argc, char *const *__argv,
+			     const char *__shortopts,
+		             const struct option *__longopts, int *__longind,
+			     int __long_only);
+# endif
+#else /* not __STDC__ */
+extern int getopt ();
+# ifndef __need_getopt
+extern int getopt_long ();
+extern int getopt_long_only ();
+
+extern int _getopt_internal ();
+# endif
+#endif /* __STDC__ */
+
+#ifdef	__cplusplus
+}
+#endif
+
+/* Make sure we later can get all the definitions and declarations.  */
+#undef __need_getopt
+
+#endif /* getopt.h */
diff --git a/extras/stdint.h b/extras/stdint.h
new file mode 100644
index 00000000..92dfa884
--- /dev/null
+++ b/extras/stdint.h
@@ -0,0 +1,186 @@
+/* ISO C9x  7.18  Integer types <stdint.h>
+ * Based on ISO/IEC SC22/WG14 9899 Committee draft (SC22 N2794)
+ *
+ *  THIS SOFTWARE IS NOT COPYRIGHTED
+ *
+ *  Contributor: Danny Smith <danny_r_smith_2001@yahoo.co.nz>
+ *
+ *  This source code is offered for use in the public domain. You may
+ *  use, modify or distribute it freely.
+ *
+ *  This code is distributed in the hope that it will be useful but
+ *  WITHOUT ANY WARRANTY. ALL WARRANTIES, EXPRESS OR IMPLIED ARE HEREBY
+ *  DISCLAIMED. This includes but is not limited to warranties of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ *  Date: 2000-12-02
+ */
+
+
+#ifndef _STDINT_H
+#define _STDINT_H
+#define __need_wint_t
+#define __need_wchar_t
+#include <stddef.h>
+
+/* 7.18.1.1  Exact-width integer types */
+typedef signed char int8_t;
+typedef unsigned char   uint8_t;
+typedef short  int16_t;
+typedef unsigned short  uint16_t;
+typedef int  int32_t;
+typedef unsigned   uint32_t;
+typedef __int64  int64_t;
+typedef unsigned __int64 uint64_t;
+
+/* 7.18.1.2  Minimum-width integer types */
+typedef signed char int_least8_t;
+typedef unsigned char   uint_least8_t;
+typedef short  int_least16_t;
+typedef unsigned short  uint_least16_t;
+typedef int  int_least32_t;
+typedef unsigned   uint_least32_t;
+typedef __int64  int_least64_t;
+typedef unsigned __int64   uint_least64_t;
+
+/*  7.18.1.3  Fastest minimum-width integer types
+ *  Not actually guaranteed to be fastest for all purposes
+ *  Here we use the exact-width types for 8 and 16-bit ints.
+ *
+ *  int_fast8_t is spelled `signed char': the signedness of plain `char'
+ *  is implementation-defined, and INT_FAST8_MIN below is negative.
+ */
+typedef signed char int_fast8_t;
+typedef unsigned char uint_fast8_t;
+typedef short  int_fast16_t;
+typedef unsigned short  uint_fast16_t;
+typedef int  int_fast32_t;
+typedef unsigned  int  uint_fast32_t;
+typedef __int64  int_fast64_t;
+typedef unsigned __int64   uint_fast64_t;
+
+/* 7.18.1.4  Integer types capable of holding object pointers */
+/*typedef int intptr_t;
+typedef unsigned uintptr_t;*/
+
+/* 7.18.1.5  Greatest-width integer types */
+typedef __int64  intmax_t;
+typedef unsigned __int64   uintmax_t;
+
+/* 7.18.2  Limits of specified-width integer types */
+#if !defined ( __cplusplus) || defined (__STDC_LIMIT_MACROS)
+
+/* 7.18.2.1  Limits of exact-width integer types */
+#define INT8_MIN (-128) 
+#define INT16_MIN (-32768)
+#define INT32_MIN (-2147483647 - 1)
+#define INT64_MIN  (-9223372036854775807LL - 1)
+
+#define INT8_MAX 127
+#define INT16_MAX 32767
+#define INT32_MAX 2147483647
+#define INT64_MAX 9223372036854775807LL
+
+#define UINT8_MAX 0xff /* 255U */
+#define UINT16_MAX 0xffff /* 65535U */
+#define UINT32_MAX 0xffffffff  /* 4294967295U */
+#define UINT64_MAX 0xffffffffffffffffULL /* 18446744073709551615ULL */
+
+/* 7.18.2.2  Limits of minimum-width integer types */
+#define INT_LEAST8_MIN INT8_MIN
+#define INT_LEAST16_MIN INT16_MIN
+#define INT_LEAST32_MIN INT32_MIN
+#define INT_LEAST64_MIN INT64_MIN
+
+#define INT_LEAST8_MAX INT8_MAX
+#define INT_LEAST16_MAX INT16_MAX
+#define INT_LEAST32_MAX INT32_MAX
+#define INT_LEAST64_MAX INT64_MAX
+
+#define UINT_LEAST8_MAX UINT8_MAX
+#define UINT_LEAST16_MAX UINT16_MAX
+#define UINT_LEAST32_MAX UINT32_MAX
+#define UINT_LEAST64_MAX UINT64_MAX
+
+/* 7.18.2.3  Limits of fastest minimum-width integer types */
+#define INT_FAST8_MIN INT8_MIN
+#define INT_FAST16_MIN INT16_MIN
+#define INT_FAST32_MIN INT32_MIN
+#define INT_FAST64_MIN INT64_MIN
+
+#define INT_FAST8_MAX INT8_MAX
+#define INT_FAST16_MAX INT16_MAX
+#define INT_FAST32_MAX INT32_MAX
+#define INT_FAST64_MAX INT64_MAX
+
+#define UINT_FAST8_MAX UINT8_MAX
+#define UINT_FAST16_MAX UINT16_MAX
+#define UINT_FAST32_MAX UINT32_MAX
+#define UINT_FAST64_MAX UINT64_MAX
+
+/* 7.18.2.4  Limits of integer types capable of holding
+    object pointers */ 
+#define INTPTR_MIN INT32_MIN
+#define INTPTR_MAX INT32_MAX
+#define UINTPTR_MAX UINT32_MAX
+
+/* 7.18.2.5  Limits of greatest-width integer types */
+#define INTMAX_MIN INT64_MIN
+#define INTMAX_MAX INT64_MAX
+#define UINTMAX_MAX UINT64_MAX
+
+/* 7.18.3  Limits of other integer types */
+#define PTRDIFF_MIN INT32_MIN
+#define PTRDIFF_MAX INT32_MAX
+
+#define SIG_ATOMIC_MIN INT32_MIN
+#define SIG_ATOMIC_MAX INT32_MAX
+
+#define SIZE_MAX UINT32_MAX
+
+#ifndef WCHAR_MIN  /* also in wchar.h */ 
+#define WCHAR_MIN 0
+#define WCHAR_MAX ((wchar_t)-1) /* UINT16_MAX */
+#endif
+
+/*
+ * wint_t is unsigned short for compatibility with MS runtime
+ */
+#define WINT_MIN 0
+#define WINT_MAX ((wint_t)-1) /* UINT16_MAX */
+
+#endif /* !defined ( __cplusplus) || defined __STDC_LIMIT_MACROS */
+
+
+/* 7.18.4  Macros for integer constants */
+#if !defined ( __cplusplus) || defined (__STDC_CONSTANT_MACROS)
+
+/* 7.18.4.1  Macros for minimum-width integer constants
+
+    According to Douglas Gwyn <gwyn@arl.mil>:
+	"This spec was changed in ISO/IEC 9899:1999 TC1; in ISO/IEC
+	9899:1999 as initially published, the expansion was required
+	to be an integer constant of precisely matching type, which
+	is impossible to accomplish for the shorter types on most
+	platforms, because C99 provides no standard way to designate
+	an integer constant with width less than that of type int.
+	TC1 changed this to require just an integer constant
+	*expression* with *promoted* type."
+
+	The trick used here is from Clive D W Feather.
+*/
+
+#define INT8_C(val) (INT_LEAST8_MAX-INT_LEAST8_MAX+(val))
+#define INT16_C(val) (INT_LEAST16_MAX-INT_LEAST16_MAX+(val))
+#define INT32_C(val) (INT_LEAST32_MAX-INT_LEAST32_MAX+(val))
+#define INT64_C(val) (INT_LEAST64_MAX-INT_LEAST64_MAX+(val))
+
+#define UINT8_C(val) (UINT_LEAST8_MAX-UINT_LEAST8_MAX+(val))
+#define UINT16_C(val) (UINT_LEAST16_MAX-UINT_LEAST16_MAX+(val))
+#define UINT32_C(val) (UINT_LEAST32_MAX-UINT_LEAST32_MAX+(val))
+#define UINT64_C(val) (UINT_LEAST64_MAX-UINT_LEAST64_MAX+(val))
+
+/* 7.18.4.2  Macros for greatest-width integer constants */
+#define INTMAX_C(val) (INTMAX_MAX-INTMAX_MAX+(val))
+#define UINTMAX_C(val) (UINTMAX_MAX-UINTMAX_MAX+(val))
+
+#endif  /* !defined ( __cplusplus) || defined __STDC_CONSTANT_MACROS */
+
+#endif
diff --git a/testing/checkasm.c b/testing/checkasm.c
new file mode 100644
index 00000000..ec3283ba
--- /dev/null
+++ b/testing/checkasm.c
@@ -0,0 +1,347 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#
+#include "core/common.h"
+#ifdef HAVE_MMXEXT
+#include "core/i386/pixel.h"
+#include "core/i386/dct.h"
+#include "core/i386/mc.h"
+#endif
+#ifdef HAVE_ALTIVEC
+#include "core/ppc/pixel.h"
+#endif
+
+/* buf1, buf2: initialised to random data; the tests must not write into them */
+uint8_t * buf1, * buf2;
+/* buf3, buf4: used to store output */
+uint8_t * buf3, * buf4;
+
+/*
+ * Compare the optimised pixel functions against the C reference.
+ *
+ * Each of the 7 slots of sad[], satd[] and avg[] that the optimised table
+ * fills in is run, together with its C counterpart, on buf1/buf2 with a
+ * stride of 32.  sad and satd are compared through their return value; avg
+ * is run on two copies of buf1 (buf3 for C, buf4 for the optimised code)
+ * and the two copies are compared afterwards.
+ *
+ * One status line per group is printed on stderr.
+ * Returns 0 when every tested function agrees with the C version, -1
+ * otherwise.
+ */
+static int check_pixel()
+{
+    x264_pixel_function_t pixel_c = {{0},{0},{0}};
+    x264_pixel_function_t pixel_asm = {{0}, {0},{0}};
+    int ret = 0, ok;
+    int i;
+
+    /* slots still NULL in pixel_asm after init are skipped below */
+    memset( &pixel_asm, 0, sizeof( x264_pixel_function_t ) );
+    x264_pixel_init( 0, &pixel_c );
+#ifdef HAVE_MMXEXT
+    x264_pixel_init( X264_CPU_MMX|X264_CPU_MMXEXT, &pixel_asm );
+#endif
+#ifdef HAVE_ALTIVEC
+    x264_pixel_altivec_init( &pixel_asm );
+#endif
+
+    /* sad: compare return values */
+    for( i = 0, ok = 1; i < 7; i++ )
+    {
+        int res_c, res_asm;
+        if( pixel_asm.sad[i] )
+        {
+            res_c   = pixel_c.sad[i]( buf1, 32, buf2, 32 );
+            res_asm = pixel_asm.sad[i]( buf1, 32, buf2, 32 );
+            if( res_c != res_asm )
+            {
+                ok = 0;
+                fprintf( stderr, "sad[%d]: %d != %d [FAILED]\n", i, res_c, res_asm );
+            }
+        }
+    }
+    if( ok )
+        fprintf( stderr, " - pixel sad :           [OK]\n" );
+    else {
+        ret = -1;
+        fprintf( stderr, " - pixel sad :           [FAILED]\n" );
+    }
+
+    /* satd: compare return values */
+    for( i = 0, ok = 1; i < 7; i++ )
+    {
+        int res_c, res_asm;
+        if( pixel_asm.satd[i] )
+        {
+            res_c   = pixel_c.satd[i]( buf1, 32, buf2, 32 );
+            res_asm = pixel_asm.satd[i]( buf1, 32, buf2, 32 );
+            if( res_c != res_asm )
+            {
+                ok = 0;
+                fprintf( stderr, "satd[%d]: %d != %d [FAILED]\n", i, res_c, res_asm );
+            }
+        }
+    }
+
+    if( ok )
+        fprintf( stderr, " - pixel satd :          [OK]\n" );
+    else {
+        ret = -1;
+        fprintf( stderr, " - pixel satd :          [FAILED]\n" );
+    }
+
+    /* avg: no return value, so compare the buffers the functions wrote to */
+    for( i = 0, ok = 1; i < 7; i++ )
+    {
+        if( pixel_asm.avg[i] )
+        {
+            memcpy( buf3, buf1, 32*32 );
+            memcpy( buf4, buf1, 32*32 );
+            pixel_c.avg[i]( buf3, 32, buf2, 32 );
+            pixel_asm.avg[i]( buf4, 32, buf2, 32 );
+            if( memcmp( buf3, buf4, 32*32 ) )
+            {
+                ok = 0;
+                fprintf( stderr, "avg[%d]: [FAILED]\n", i );
+            }
+        }
+    }
+
+    if( ok )
+        fprintf( stderr, " - pixel avg :           [OK]\n" );
+    else {
+        ret = -1;
+        fprintf( stderr, " - pixel avg :           [FAILED]\n" );
+    }
+
+    return ret;
+}
+
+/*
+ * Compare the optimised DCT functions against the C reference.
+ *
+ * Covers the sub*_dct and add*_idct families and the 4x4 / 2x2 DC
+ * transforms.  A function is only tested when the optimised table provides
+ * it.  One status line per group is printed on stderr.
+ * Returns 0 when everything tested matches, -1 otherwise.
+ */
+static int check_dct()
+{
+    x264_dct_function_t dct_c;
+    x264_dct_function_t dct_asm;
+    int ret = 0, ok;
+    int16_t dct1[16][4][4] __attribute((aligned(16)));
+    int16_t dct2[16][4][4] __attribute((aligned(16)));
+
+    /* entries still NULL in dct_asm after init are skipped by the tests */
+    memset( &dct_asm, 0, sizeof( dct_asm ) );
+    x264_dct_init( 0, &dct_c );
+#ifdef HAVE_MMXEXT
+    x264_dct_init( X264_CPU_MMX|X264_CPU_MMXEXT, &dct_asm );
+#endif
+    /* TEST_DCT: run dct_c.name and dct_asm.name on the same inputs
+     * (buf1/buf2, stride 32) and compare the first `size' bytes of their
+     * outputs t1 and t2 */
+#define TEST_DCT( name, t1, t2, size ) \
+    if( dct_asm.name ) \
+    { \
+        dct_c.name( t1, buf1, 32, buf2, 32 ); \
+        dct_asm.name( t2, buf1, 32, buf2, 32 ); \
+        if( memcmp( t1, t2, size ) ) \
+        { \
+            ok = 0; \
+            fprintf( stderr, #name " [FAILED]\n" ); \
+        } \
+    }
+    ok = 1;
+    /* sizes are in bytes: 16 int16_t coefficients per 4x4 block */
+    TEST_DCT( sub4x4_dct, dct1[0], dct2[0], 16*2 );
+    TEST_DCT( sub8x8_dct, dct1, dct2, 16*2*4 );
+    TEST_DCT( sub16x16_dct, dct1, dct2, 16*2*16 );
+    if( ok )
+        fprintf( stderr, " - sub_dctXxX :          [OK]\n" );
+    else {
+        ret = -1;
+        fprintf( stderr, " - sub_dctXxX :          [FAILED]\n" );
+    }
+#undef TEST_DCT
+
+    /* TEST_IDCT: apply dct_c.name / dct_asm.name with coefficients t to
+     * two copies of buf1 (buf3 and buf4) and compare the copies.  The
+     * coefficients passed below are whatever the TEST_DCT calls above left
+     * in dct1 */
+#define TEST_IDCT( name, t ) \
+    if( dct_asm.name ) \
+    { \
+        memcpy( buf3, buf1, 32*32 ); \
+        memcpy( buf4, buf1, 32*32 ); \
+        dct_c.name( buf3, 32, t ); \
+        dct_asm.name( buf4, 32, t ); \
+        if( memcmp( buf3, buf4, 32*32 ) ) \
+        { \
+            ok = 0; \
+            fprintf( stderr, #name " [FAILED]\n" ); \
+        } \
+    }
+    ok = 1;
+    TEST_IDCT( add4x4_idct, dct1[0] );
+    TEST_IDCT( add8x8_idct, dct1 );
+    TEST_IDCT( add16x16_idct, dct1 );
+    if( ok )
+        fprintf( stderr, " - add_idctXxX :         [OK]\n" );
+    else {
+        ret = -1;
+        fprintf( stderr, " - add_idctXxX :         [FAILED]\n" );
+    }
+#undef TEST_IDCT
+
+    /* 4x4 DC transforms work in place, so each version gets its own copy of
+     * the same fixed input.  NOTE: the dct1/dct2 declared in the blocks
+     * below shadow the function-level arrays of the same name */
+    ok = 1;
+    if( dct_asm.dct4x4dc )
+    {
+        int16_t dct1[4][4] __attribute((aligned(16))) = { {-12, 42, 23, 67},{2, 90, 89,56}, {67,43,-76,91},{56,-78,-54,1}};
+        int16_t dct2[4][4] __attribute((aligned(16))) = { {-12, 42, 23, 67},{2, 90, 89,56}, {67,43,-76,91},{56,-78,-54,1}};
+
+        dct_c.dct4x4dc( dct1 );
+        dct_asm.dct4x4dc( dct2 );
+        if( memcmp( dct1, dct2, 32 ) )
+        {
+            ok = 0;
+            fprintf( stderr, " - dct4x4dc :        [FAILED]\n" );
+        }
+    }
+    if( dct_asm.idct4x4dc )
+    {
+        int16_t dct1[4][4] __attribute((aligned(16))) = { {-12, 42, 23, 67},{2, 90, 89,56}, {67,43,-76,91},{56,-78,-54,1}};
+        int16_t dct2[4][4] __attribute((aligned(16))) = { {-12, 42, 23, 67},{2, 90, 89,56}, {67,43,-76,91},{56,-78,-54,1}};
+
+        dct_c.idct4x4dc( dct1 );
+        dct_asm.idct4x4dc( dct2 );
+        if( memcmp( dct1, dct2, 32 ) )
+        {
+            ok = 0;
+            fprintf( stderr, " - idct4x4dc :        [FAILED]\n" );
+        }
+    }
+    if( ok )
+        fprintf( stderr, " - (i)dct4x4dc :         [OK]\n" );
+    else {
+        ret = -1;
+        fprintf( stderr, " - (i)dct4x4dc :         [FAILED]\n" );
+    }
+
+    /* 2x2 DC transforms: same scheme, 4 coefficients (8 bytes) compared */
+    ok = 1;
+    if( dct_asm.dct2x2dc )
+    {
+        int16_t dct1[2][2] __attribute((aligned(16))) = { {-12, 42},{2, 90}};
+        int16_t dct2[2][2] __attribute((aligned(16))) = { {-12, 42},{2, 90}};
+
+        dct_c.dct2x2dc( dct1 );
+        dct_asm.dct2x2dc( dct2 );
+        if( memcmp( dct1, dct2, 4*2 ) )
+        {
+            ok = 0;
+            fprintf( stderr, " - dct2x2dc :        [FAILED]\n" );
+        }
+    }
+    if( dct_asm.idct2x2dc )
+    {
+        int16_t dct1[2][2] __attribute((aligned(16))) = { {-12, 42},{2, 90}};
+        int16_t dct2[2][2] __attribute((aligned(16))) = { {-12, 42},{2, 90}};
+
+        dct_c.idct2x2dc( dct1 );
+        dct_asm.idct2x2dc( dct2 );
+        if( memcmp( dct1, dct2, 4*2 ) )
+        {
+            ok = 0;
+            fprintf( stderr, " - idct2x2dc :       [FAILED]\n" );
+        }
+    }
+
+    if( ok )
+        fprintf( stderr, " - (i)dct2x2dc :         [OK]\n" );
+    else {
+        ret = -1;
+        fprintf( stderr, " - (i)dct2x2dc :         [FAILED]\n" );
+    }
+
+
+    return ret;
+}
+
+/*
+ * Compare the optimised motion compensation functions against the C
+ * reference.
+ *
+ * Entry 0 of the function tables is reported as "luma" and entry 1 as
+ * "chroma".  Every block size listed below is tried for each (dx,dy) in
+ * 0..3 x 0..3.  Returns 0 when everything tested matches, -1 otherwise.
+ */
+static int check_mc()
+{
+    x264_mc_function_t mc_c[2] = {0};
+    x264_mc_function_t mc_asm[2] = {0};
+    /* source and destinations start 2 rows and 2 columns inside the
+     * 32-byte-stride buffers */
+    uint8_t *src = &buf1[2*32+2];
+    uint8_t *dst1 = &buf3[2*32+2];
+    uint8_t *dst2 = &buf4[2*32+2];
+    int dx, dy;
+    int ret = 0, ok[2] = { 1, 1 };
+
+    x264_mc_init( 0, mc_c );
+#ifdef HAVE_MMXEXT
+    x264_mc_mmxext_init( mc_asm );
+#endif
+
+    memset( buf3, 0, 32*32 );
+    memset( buf4, 0, 32*32 );
+
+    /* Do the MC */
+    /* MC_TEST: if mc_asm[t] exists, pre-fill h rows of 16 bytes of each
+     * destination with 0xCD, run the C version into dst1 and the optimised
+     * one into dst2 (destination stride 16), then compare 16*16 bytes */
+#define MC_TEST( t, w, h ) \
+        if( mc_asm[t] ) \
+        { \
+            memset(dst1, 0xCD, (h) * 16); \
+            mc_c[t]( src, 32, dst1, 16, dx, dy, w, h );     \
+            memset(dst2, 0xCD, (h) * 16); \
+            mc_asm[t]( src, 32, dst2, 16, dx, dy, w, h );   \
+            if( memcmp( dst1, dst2, 16*16 ) )               \
+            { \
+                fprintf( stderr, "mc["#t"][mv(%d,%d) %2dx%-2d]     [FAILED]\n", dx, dy, w, h );   \
+                ok[t] = 0; \
+            } \
+        }
+
+    for( dy = 0; dy < 4; dy++ )
+    {
+        for( dx = 0; dx < 4; dx++ )
+        {
+            /* table entry 0 */
+            MC_TEST( 0, 16, 16 );
+            MC_TEST( 0, 16, 8 );
+            MC_TEST( 0, 8, 16 );
+            MC_TEST( 0, 8, 8 );
+            MC_TEST( 0, 8, 4 );
+            MC_TEST( 0, 4, 8 );
+            MC_TEST( 0, 4, 4 );
+
+            /* table entry 1 */
+            MC_TEST( 1, 8, 8 );
+            MC_TEST( 1, 8, 4 );
+            MC_TEST( 1, 4, 8 );
+            MC_TEST( 1, 4, 4 );
+            MC_TEST( 1, 4, 2 );
+            MC_TEST( 1, 2, 4 );
+            MC_TEST( 1, 2, 2 );
+        }
+    }
+#undef MC_TEST
+    if( ok[0] )
+        fprintf( stderr, " - mc luma :             [OK]\n" );
+    else {
+        ret = -1;
+        fprintf( stderr, " - mc luma :             [FAILED]\n" );
+    }
+    if( ok[1] )
+        fprintf( stderr, " - mc chroma :           [OK]\n" );
+    else {
+        ret = -1;
+        fprintf( stderr, " - mc chroma :           [FAILED]\n" );
+    }
+    return ret;
+}
+
+/*
+ * Entry point: allocates the four 32x32 work buffers, fills buf1/buf2 with
+ * random bytes, runs every check and reports the overall result on stderr.
+ *
+ * Returns 0 when all checks pass, -1 when a check failed or the buffers
+ * could not be allocated.
+ */
+int main( void )
+{
+    int ret;
+    int i;
+
+    /* use defined() so that a macro defined without a value still works */
+#if defined(HAVE_MMXEXT)
+    fprintf( stderr, "x264: MMXEXT against C\n" );
+#elif defined(HAVE_ALTIVEC)
+    fprintf( stderr, "x264: ALTIVEC against C\n" );
+#endif
+
+    buf1 = x264_malloc( 1024 ); /* 32 x 32 */
+    buf2 = x264_malloc( 1024 );
+    buf3 = x264_malloc( 1024 );
+    buf4 = x264_malloc( 1024 );
+    if( !buf1 || !buf2 || !buf3 || !buf4 )
+    {
+        fprintf( stderr, "x264: unable to allocate the test buffers\n" );
+        return -1;
+    }
+
+    srand( x264_mdate() );
+
+    for( i = 0; i < 1024; i++ )
+    {
+        /* mask rather than "% 0xFF" so that the value 255 can occur too */
+        buf1[i] = rand() & 0xFF;
+        buf2[i] = rand() & 0xFF;
+        buf3[i] = buf4[i] = 0;
+    }
+
+    ret = check_pixel() +
+          check_dct() +
+          check_mc();
+
+    if( ret == 0 )
+    {
+        fprintf( stderr, "x264: All tests passed Yeah :)\n" );
+        return 0;
+    }
+    fprintf( stderr, "x264: at least one test has failed. Go and fix that Right Now!\n" );
+    return -1;
+}
+
diff --git a/testing/edge-detec.c b/testing/edge-detec.c
new file mode 100644
index 00000000..e78c71a6
--- /dev/null
+++ b/testing/edge-detec.c
@@ -0,0 +1,2733 @@
+/*****************************************************************************
+ * macroblock.c: h264 encoder library
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: edge-detec.c,v 1.1 2004/06/03 19:27:08 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdint.h>
+#include <math.h>
+
+#include "common.h"
+#include "me.h"
+#include "vlc.h"
+
+/* Return the median of a, b and c, computed as their sum minus the
+ * smallest and the largest of the three. */
+static inline int x264_median( int a, int b, int c )
+{
+    /* order a and b first: one of them is the lower bound, the other the
+     * upper bound */
+    int lo = ( b < a ) ? b : a;
+    int hi = ( b < a ) ? a : b;
+
+    /* then let c widen whichever bound it falls outside of */
+    if( c < lo )
+        lo = c;
+    else if( c > hi )
+        hi = c;
+
+    return a + b + c - lo - hi;
+}
+
+/* 48-entry lookup tables.  Judging by the names, they map a coded block
+ * pattern value (0..47) to the Exp-Golomb code number used to signal it,
+ * for intra 4x4 and for inter macroblocks respectively -- TODO confirm
+ * against the coded_block_pattern mapping table of the H.264 spec. */
+static const uint8_t intra4x4_cbp_to_golomb[48]=
+{
+  3, 29, 30, 17, 31, 18, 37,  8, 32, 38, 19,  9, 20, 10, 11,  2,
+ 16, 33, 34, 21, 35, 22, 39,  4, 36, 40, 23,  5, 24,  6,  7,  1,
+ 41, 42, 43, 25, 44, 26, 46, 12, 45, 47, 27, 13, 28, 14, 15,  0
+};
+static const uint8_t inter_cbp_to_golomb[48]=
+{
+  0,  2,  3,  7,  4,  8, 17, 13,  5, 18,  9, 14, 10, 15, 16, 11,
+  1, 32, 33, 36, 34, 37, 44, 40, 35, 45, 38, 41, 39, 42, 43, 19,
+  6, 24, 25, 20, 26, 21, 46, 28, 27, 47, 22, 29, 23, 30, 31, 12
+};
+
+/* Position of each of the 16 block indices inside a 4x4 grid:
+ * block_idx_x[idx] is the column and block_idx_y[idx] the row. */
+static const uint8_t block_idx_x[16] =
+{
+    0, 1, 0, 1, 2, 3, 2, 3, 0, 1, 0, 1, 2, 3, 2, 3
+};
+static const uint8_t block_idx_y[16] =
+{
+    0, 0, 1, 1, 0, 0, 1, 1, 2, 2, 3, 3, 2, 2, 3, 3
+};
+/* Inverse of the two tables above: block_idx_xy[x][y] is the block index
+ * located at column x, row y. */
+static const uint8_t block_idx_xy[4][4] =
+{
+    { 0, 2, 8,  10},
+    { 1, 3, 9,  11},
+    { 4, 6, 12, 14},
+    { 5, 7, 13, 15}
+};
+
+/* Quantisation multipliers, indexed as quant_mf[i_qscale % 6][y][x] by the
+ * quant_* functions of this file. */
+static const int quant_mf[6][4][4] =
+{
+    {  { 13107, 8066, 13107, 8066}, {  8066, 5243,  8066, 5243},
+       { 13107, 8066, 13107, 8066}, {  8066, 5243,  8066, 5243}  },
+    {  { 11916, 7490, 11916, 7490}, {  7490, 4660,  7490, 4660},
+       { 11916, 7490, 11916, 7490}, {  7490, 4660,  7490, 4660}  },
+    {  { 10082, 6554, 10082, 6554}, {  6554, 4194,  6554, 4194},
+       { 10082, 6554, 10082, 6554}, {  6554, 4194,  6554, 4194}  },
+    {  {  9362, 5825,  9362, 5825}, {  5825, 3647,  5825, 3647},
+       {  9362, 5825,  9362, 5825}, {  5825, 3647,  5825, 3647}  },
+    {  {  8192, 5243,  8192, 5243}, {  5243, 3355,  5243, 3355},
+       {  8192, 5243,  8192, 5243}, {  5243, 3355,  5243, 3355}  },
+    {  {  7282, 4559,  7282, 4559}, {  4559, 2893,  4559, 2893},
+       {  7282, 4559,  7282, 4559}, {  4559, 2893,  4559, 2893}  }
+};
+
+/* Dequantisation multipliers, same [6][4][4] layout as quant_mf.
+ * NOTE(review): presumably indexed [i_qscale % 6][y][x] like quant_mf;
+ * the users of this table are not in the visible part of the file. */
+static const int dequant_mf[6][4][4] =
+{
+    { {10, 13, 10, 13}, {13, 16, 13, 16}, {10, 13, 10, 13}, {13, 16, 13, 16} },
+    { {11, 14, 11, 14}, {14, 18, 14, 18}, {11, 14, 11, 14}, {14, 18, 14, 18} },
+    { {13, 16, 13, 16}, {16, 20, 16, 20}, {13, 16, 13, 16}, {16, 20, 16, 20} },
+    { {14, 18, 14, 18}, {18, 23, 18, 23}, {14, 18, 14, 18}, {18, 23, 18, 23} },
+    { {16, 20, 16, 20}, {20, 25, 20, 25}, {16, 20, 16, 20}, {20, 25, 20, 25} },
+    { {18, 23, 18, 23}, {23, 29, 23, 29}, {18, 23, 18, 23}, {23, 29, 23, 29} }
+};
+
+
+/*
+ * Predict the intra 4x4 mode of block idx from its two neighbours "a" and
+ * "b" as recorded in mb->context->block[idx].
+ *
+ * Returns I_PRED_4x4_DC when either neighbouring macroblock is missing.
+ * Otherwise returns the smaller of the two neighbour modes, where a
+ * neighbour whose macroblock is not I_4x4 counts as I_PRED_4x4_DC.
+ * h is unused.
+ */
+static int predict_pred_intra4x4_mode( x264_t *h, x264_macroblock_t *mb, int idx )
+{
+    x264_macroblock_t *mba = mb->context->block[idx].mba;
+    x264_macroblock_t *mbb = mb->context->block[idx].mbb;
+    int i_pred_a, i_pred_b;
+
+    if( mba == NULL || mbb == NULL )
+        return I_PRED_4x4_DC;
+
+    if( mba->i_type == I_4x4 )
+        i_pred_a = mb->context->block[idx].bka->i_intra4x4_pred_mode;
+    else
+        i_pred_a = I_PRED_4x4_DC;
+
+    if( mbb->i_type == I_4x4 )
+        i_pred_b = mb->context->block[idx].bkb->i_intra4x4_pred_mode;
+    else
+        i_pred_b = I_PRED_4x4_DC;
+
+    return X264_MIN( i_pred_a, i_pred_b );
+}
+
+/*
+ * Predict the non-zero coefficient count of block idx from its two
+ * neighbours "a" and "b".
+ *
+ * No neighbour available -> 0, one available -> its count, both available
+ * -> (a + b + 1) >> 1.  A missing neighbour contributes the marker 0x80,
+ * which keeps the sum >= 0x80 (no averaging) and is stripped by the final
+ * mask.  h is unused.
+ */
+static int predict_non_zero_code( x264_t *h, x264_macroblock_t *mb, int idx )
+{
+    x264_macroblock_t *mba = mb->context->block[idx].mba;
+    x264_macroblock_t *mbb = mb->context->block[idx].mbb;
+    int i_nz_a, i_nz_b;
+    int i_pred;
+
+    if( mba )
+        i_nz_a = mb->context->block[idx].bka->i_non_zero_count;
+    else
+        i_nz_a = 0x80;
+
+    if( mbb )
+        i_nz_b = mb->context->block[idx].bkb->i_non_zero_count;
+    else
+        i_nz_b = 0x80;
+
+    i_pred = i_nz_a + i_nz_b;
+    if( i_pred < 0x80 )
+        i_pred = ( i_pred + 1 ) >> 1;   /* both present: rounded average */
+
+    return i_pred & 0x7f;
+}
+
+
+/*
+ * Handle intra mb
+ */
+/* Max = 4 */
+/*
+ * List the intra 16x16 prediction modes usable for mb, depending on the
+ * MB_LEFT / MB_TOP bits of mb->i_neighbour.  The modes are stored from
+ * mode[0] on and their number in *pi_count; at most 4 entries are written.
+ */
+static void predict_16x16_mode_available( x264_macroblock_t *mb, int *mode, int *pi_count )
+{
+    if( ( mb->i_neighbour & (MB_LEFT|MB_TOP) ) == (MB_LEFT|MB_TOP) )
+    {
+        /* top and left are both available */
+        mode[0] = I_PRED_16x16_DC;
+        mode[1] = I_PRED_16x16_V;
+        mode[2] = I_PRED_16x16_H;
+        mode[3] = I_PRED_16x16_P;
+        *pi_count = 4;
+        return;
+    }
+
+    if( mb->i_neighbour & MB_LEFT )
+    {
+        /* only left is available */
+        mode[0] = I_PRED_16x16_DC_LEFT;
+        mode[1] = I_PRED_16x16_H;
+        *pi_count = 2;
+        return;
+    }
+
+    if( mb->i_neighbour & MB_TOP )
+    {
+        /* only top is available */
+        mode[0] = I_PRED_16x16_DC_TOP;
+        mode[1] = I_PRED_16x16_V;
+        *pi_count = 2;
+        return;
+    }
+
+    /* no neighbour available */
+    mode[0] = I_PRED_16x16_DC_128;
+    *pi_count = 1;
+}
+
+/* Max = 4 */
+/*
+ * List the chroma (8x8) intra prediction modes usable for mb, depending on
+ * the MB_LEFT / MB_TOP bits of mb->i_neighbour.  The modes are stored from
+ * mode[0] on and their number in *pi_count; at most 4 entries are written.
+ */
+static void predict_8x8_mode_available( x264_macroblock_t *mb, int *mode, int *pi_count )
+{
+    if( ( mb->i_neighbour & (MB_LEFT|MB_TOP) ) == (MB_LEFT|MB_TOP) )
+    {
+        /* top and left are both available */
+        mode[0] = I_PRED_CHROMA_DC;
+        mode[1] = I_PRED_CHROMA_V;
+        mode[2] = I_PRED_CHROMA_H;
+        mode[3] = I_PRED_CHROMA_P;
+        *pi_count = 4;
+        return;
+    }
+
+    if( mb->i_neighbour & MB_LEFT )
+    {
+        /* only left is available */
+        mode[0] = I_PRED_CHROMA_DC_LEFT;
+        mode[1] = I_PRED_CHROMA_H;
+        *pi_count = 2;
+        return;
+    }
+
+    if( mb->i_neighbour & MB_TOP )
+    {
+        /* only top is available */
+        mode[0] = I_PRED_CHROMA_DC_TOP;
+        mode[1] = I_PRED_CHROMA_V;
+        *pi_count = 2;
+        return;
+    }
+
+    /* no neighbour available */
+    mode[0] = I_PRED_CHROMA_DC_128;
+    *pi_count = 1;
+}
+
+/*
+ * List the intra 4x4 prediction modes usable for block idx of mb.
+ *
+ * The modes are stored from mode[0] on and their number in *pi_count.
+ * MAX = 9: seven modes when both the left and the top neighbours are
+ * usable, plus DDL and VL when the top-right one is usable as well, so
+ * `mode' must have room for 9 entries.
+ *
+ * needmb[idx] holds the neighbour bits block idx depends on; a neighbour
+ * counts as usable when every bit required for it is set in
+ * mb->i_neighbour.
+ */
+static void predict_4x4_mode_available( x264_macroblock_t *mb, int idx, int *mode, int *pi_count )
+{
+    int b_a, b_b, b_c;
+    int i_count = 0;
+    static const int needmb[16] =
+    {
+        MB_LEFT|MB_TOP, MB_TOP,
+        MB_LEFT,        MB_PRIVATE,
+        MB_TOP,         MB_TOP|MB_TOPRIGHT,
+        0,              MB_PRIVATE,
+        MB_LEFT,        0,
+        MB_LEFT,        MB_PRIVATE,
+        0,              MB_PRIVATE,
+        0,              MB_PRIVATE
+    };
+
+    /* FIXME even when b_c == 0 there are some cases where the missing
+     * pixels are emulated and thus more modes are available TODO
+     * analysis and encode should be fixed too */
+    b_a = (needmb[idx]&mb->i_neighbour&MB_LEFT) == (needmb[idx]&MB_LEFT);
+    b_b = (needmb[idx]&mb->i_neighbour&MB_TOP) == (needmb[idx]&MB_TOP);
+    b_c = (needmb[idx]&mb->i_neighbour&(MB_TOPRIGHT|MB_PRIVATE)) == (needmb[idx]&(MB_TOPRIGHT|MB_PRIVATE));
+
+    if( b_a && b_b )
+    {
+        mode[i_count++] = I_PRED_4x4_DC;
+        mode[i_count++] = I_PRED_4x4_H;
+        mode[i_count++] = I_PRED_4x4_V;
+        mode[i_count++] = I_PRED_4x4_DDR;
+        mode[i_count++] = I_PRED_4x4_VR;
+        mode[i_count++] = I_PRED_4x4_HD;
+        mode[i_count++] = I_PRED_4x4_HU;
+
+        if( b_c )
+        {
+            /* these two also need the top-right neighbour */
+            mode[i_count++] = I_PRED_4x4_DDL;
+            mode[i_count++] = I_PRED_4x4_VL;
+        }
+    }
+    else if( b_a )
+    {
+        /* left only */
+        mode[i_count++] = I_PRED_4x4_DC_LEFT;
+        mode[i_count++] = I_PRED_4x4_H;
+    }
+    else if( b_b )
+    {
+        /* top only */
+        mode[i_count++] = I_PRED_4x4_DC_TOP;
+        mode[i_count++] = I_PRED_4x4_V;
+    }
+    else
+    {
+        /* nothing usable */
+        mode[i_count++] = I_PRED_4x4_DC_128;
+    }
+
+    *pi_count = i_count;
+}
+
+/****************************************************************************
+ * Scan and Quant functions
+ ****************************************************************************/
+/* Column (x) and row (y) of the i-th coefficient of a 4x4 block in zigzag
+ * scan order; the scan functions below read dct[scan_zigzag_y[i]][scan_zigzag_x[i]]. */
+static const int scan_zigzag_x[16]={0, 1, 0, 0, 1, 2, 3, 2, 1, 0, 1, 2, 3, 3, 2, 3};
+static const int scan_zigzag_y[16]={0, 0, 1, 2, 1, 0, 0, 1, 2, 3, 3, 2, 1, 2, 3, 3};
+
+/* Copy all 16 coefficients of dct into level[0..15] in zigzag scan order. */
+static inline void scan_zigzag_4x4full( int level[16], int16_t dct[4][4] )
+{
+    int i_pos = 0;
+
+    while( i_pos < 16 )
+    {
+        const int i_row = scan_zigzag_y[i_pos];
+        const int i_col = scan_zigzag_x[i_pos];
+
+        level[i_pos] = dct[i_row][i_col];
+        i_pos++;
+    }
+}
+/* Zigzag-scan the 15 AC coefficients of dct into level[]; the DC at dct[0][0] is left out. */
+static inline void scan_zigzag_4x4( int level[15], int16_t dct[4][4] )
+{
+    int i_ac = 0;
+
+    for( ; i_ac < 15; i_ac++ )
+    {
+        /* slot i_ac of level[] holds entry i_ac + 1 of the 16-entry scan */
+        level[i_ac] = dct[scan_zigzag_y[i_ac + 1]][scan_zigzag_x[i_ac + 1]];
+    }
+}
+
+/* Copy a 2x2 DC block into level[] in row-major order: [0][0], [0][1], [1][0], [1][1]. */
+static inline void scan_zigzag_2x2_dc( int level[4], int16_t dct[2][2] )
+{
+    int i;
+
+    for( i = 0; i < 4; i++ )
+    {
+        level[i] = dct[i >> 1][i & 1];
+    }
+}
+
+
+/* Quantize a 4x4 coefficient block in place.
+ * i_qscale selects the multiplier row (i_qscale % 6) and the shift (15 + i_qscale / 6);
+ * the rounding offset is a third of a step when b_intra is set, a sixth otherwise. */
+static void quant_4x4( int16_t dct[4][4], int i_qscale, int b_intra )
+{
+    const int i_shift = 15 + i_qscale / 6;
+    const int i_row   = i_qscale % 6;
+    const int i_round = ( 1 << i_shift ) / ( b_intra ? 3 : 6 );
+    int x, y;
+
+    for( y = 0; y < 4; y++ )
+    {
+        for( x = 0; x < 4; x++ )
+        {
+            const int64_t i_scaled = (int64_t)dct[y][x] * (int64_t)quant_mf[i_row][y][x];
+
+            /* round the magnitude, then put the sign back (0 takes the second arm) */
+            dct[y][x] = dct[y][x] > 0 ?  ( ( i_round + i_scaled ) >> i_shift )
+                                      : -( ( i_round - i_scaled ) >> i_shift );
+        }
+    }
+}
+/* Quantize a 4x4 block of DC coefficients in place: every position uses the [0][0]
+ * multiplier of row i_qscale % 6, the rounding offset is 2*f and the shift 1 + i_qbits,
+ * where f is a third (b_intra) or a sixth of 1 << i_qbits. */
+static void quant_4x4_dc( int16_t dct[4][4], int i_qscale, int b_intra )
+{
+    const int i_qbits  = 15 + i_qscale / 6;
+    const int i_round2 = 2 * ( ( 1 << i_qbits ) / ( b_intra ? 3 : 6 ) );
+    const int i_row    = i_qscale % 6;
+    int i;
+
+    for( i = 0; i < 16; i++ )
+    {
+        int16_t *p_coef = &dct[i / 4][i % 4];
+        const int64_t i_scaled = (int64_t)*p_coef * (int64_t)quant_mf[i_row][0][0];
+
+        if( *p_coef > 0 )
+        {
+            *p_coef = ( i_round2 + i_scaled ) >> ( 1 + i_qbits );
+        }
+        else
+        {
+            *p_coef = - ( ( i_round2 - i_scaled ) >> ( 1 + i_qbits ) );
+        }
+    }
+}
+/* Quantize a 2x2 DC block in place: [0][0] multiplier of row i_qscale % 6, rounding
+ * offset 2*f and shift 1 + i_qbits (f is a third of 1 << i_qbits when b_intra is set,
+ * a sixth otherwise). */
+static void quant_2x2_dc( int16_t dct[2][2], int i_qscale, int b_intra )
+{
+    const int i_qbits = 15 + i_qscale / 6;
+    const int i_shift = 1 + i_qbits;
+    const int i_round = 2 * ( ( 1 << i_qbits ) / ( b_intra ? 3 : 6 ) );
+    const int i_row   = i_qscale % 6;
+    int x, y;
+
+    for( y = 0; y < 2; y++ )
+    {
+        for( x = 0; x < 2; x++ )
+        {
+            /* XXX: is int64_t really needed ? */
+            const int64_t i_scaled = (int64_t)dct[y][x] * (int64_t)quant_mf[i_row][0][0];
+            const int64_t i_level  = dct[y][x] > 0 ?  ( ( i_round + i_scaled ) >> i_shift )
+                                                   : -( ( i_round - i_scaled ) >> i_shift );
+
+            dct[y][x] = i_level;
+        }
+    }
+}
+
+/* Dequantize a 4x4 DC block in place with the [0][0] scale of row i_qscale % 6.
+ * With i_qbits = i_qscale / 6: when i_qbits >= 2 the product is multiplied by
+ * 2^(i_qbits - 2); otherwise it is rounded and shifted right by 2 - i_qbits.
+ * The power of two is applied with a multiplication rather than <<, because
+ * left-shifting a negative int is undefined behaviour in C. */
+static void dequant_4x4_dc( int16_t dct[4][4], int i_qscale )
+{
+    const int i_scale = dequant_mf[i_qscale%6][0][0];
+    const int i_qbits = i_qscale/6;
+    int x,y;
+
+    for( y = 0; y < 4; y++ )
+    {
+        for( x = 0; x < 4; x++ )
+        {
+            const int i_val = dct[y][x] * i_scale;
+
+            if( i_qbits >= 2 )
+            {
+                dct[y][x] = i_val * ( 1 << ( i_qbits - 2 ) );
+            }
+            else
+            {
+                /* i_qbits is 0 or 1: add half of the divisor before shifting */
+                dct[y][x] = ( i_val + ( 1 << ( 1 - i_qbits ) ) ) >> ( 2 - i_qbits );
+            }
+        }
+    }
+}
+
+/* Dequantize a 2x2 DC block in place with the [0][0] scale of row i_qscale % 6:
+ * the product is multiplied by 2^(i_qbits - 1) when i_qbits = i_qscale / 6 is at
+ * least 1, and shifted right by one otherwise.  The scaling uses a multiplication
+ * rather than <<, because left-shifting a negative int is undefined behaviour in C. */
+static void dequant_2x2_dc( int16_t dct[2][2], int i_qscale )
+{
+    const int i_scale = dequant_mf[i_qscale%6][0][0];
+    const int i_qbits = i_qscale/6;
+    int x,y;
+
+    for( y = 0; y < 2; y++ )
+    {
+        for( x = 0; x < 2; x++ )
+        {
+            const int i_val = dct[y][x] * i_scale;
+
+            if( i_qbits >= 1 )
+            {
+                dct[y][x] = i_val * ( 1 << ( i_qbits - 1 ) );
+            }
+            else
+            {
+                dct[y][x] = i_val >> 1;
+            }
+        }
+    }
+}
+/* Dequantize a 4x4 block in place: each coefficient is multiplied by its dequant_mf
+ * entry (row i_qscale % 6) and by 2^(i_qscale / 6).  A multiplication is used instead
+ * of <<, because left-shifting a negative int is undefined behaviour in C. */
+static void dequant_4x4( int16_t dct[4][4], int i_qscale )
+{
+    const int i_mf  = i_qscale%6;
+    const int i_mul = 1 << ( i_qscale/6 );
+    int x,y;
+
+    for( y = 0; y < 4; y++ )
+    {
+        for( x = 0; x < 4; x++ )
+        {
+            /* NOTE(review): the table is read as [x][y] here while quant_4x4 reads
+             * quant_mf as [y][x]; harmless only if dequant_mf is symmetric - confirm */
+            dct[y][x] = ( dct[y][x] * dequant_mf[i_mf][x][y] ) * i_mul;
+        }
+    }
+}
+
+/* Return how many of the first i_count entries of v are non-zero. */
+static inline int array_non_zero_count( int *v, int i_count )
+{
+    int i_nz = 0;
+    int i_left;
+
+    /* walk backwards; !! turns any non-zero value into 1 */
+    for( i_left = i_count - 1; i_left >= 0; i_left-- )
+    {
+        i_nz += !!v[i_left];
+    }
+
+    return i_nz;
+}
+
+/* Number of partitions of a macroblock split as i_partition.
+ * TODO : use a table instead */
+static int mb_partition_count( int i_partition )
+{
+    if( i_partition == D_16x16 )
+    {
+        return 1;
+    }
+    if( i_partition == D_16x8 || i_partition == D_8x16 )
+    {
+        return 2;
+    }
+    if( i_partition == D_8x8 )
+    {
+        return 4;
+    }
+
+    /* should never occur */
+    return 0;
+}
+
+/* Number of sub-partitions of an 8x8 block for the sub-partition type i_partition:
+ *   4 for the D_*_4x4 types,
+ *   2 for the D_*_4x8 and D_*_8x4 types,
+ *   1 for the D_*_8x8 types and D_DIRECT_8x8,
+ *   0 for any other value (should never occur). */
+static int mb_sub_partition_count( int i_partition )
+{
+    int i_count = 0;
+
+    if( i_partition == D_L0_4x4 || i_partition == D_L1_4x4 || i_partition == D_BI_4x4 )
+    {
+        i_count = 4;
+    }
+    else if( i_partition == D_L0_4x8 || i_partition == D_L1_4x8 || i_partition == D_BI_4x8 ||
+             i_partition == D_L0_8x4 || i_partition == D_L1_8x4 || i_partition == D_BI_8x4 )
+    {
+        i_count = 2;
+    }
+    else if( i_partition == D_L0_8x8 || i_partition == D_L1_8x8 || i_partition == D_BI_8x8 ||
+             i_partition == D_DIRECT_8x8 )
+    {
+        i_count = 1;
+    }
+
+    return i_count;
+}
+
+/* Compute the top-left corner (*x, *y), in units of 4x4 blocks inside the macroblock,
+ * of partition i_part; for a D_8x8 macroblock the offset of sub-partition i_sub inside
+ * its 8x8 quadrant is added.  Nothing is written when mb->i_partition is not one of
+ * D_16x16, D_16x8, D_8x16 or D_8x8. */
+static inline void x264_macroblock_partition_getxy( x264_macroblock_t *mb, int i_part, int i_sub, int *x, int *y )
+{
+    switch( mb->i_partition )
+    {
+        case D_16x16:
+            *x = 0;
+            *y = 0;
+            break;
+
+        case D_16x8:    /* two halves stacked vertically */
+            *x = 0;
+            *y = 2*i_part;
+            break;
+
+        case D_8x16:    /* two halves side by side */
+            *x = 2*i_part;
+            *y = 0;
+            break;
+
+        case D_8x8:
+            /* corner of the 8x8 quadrant ... */
+            *x = 2 * (i_part%2);
+            *y = 2 * (i_part/2);
+
+            /* ... then the position of the sub-partition inside it */
+            if( IS_SUB4x4( mb->i_sub_partition[i_part] ) )
+            {
+                *x += i_sub%2;
+                *y += i_sub/2;
+            }
+            else if( IS_SUB4x8( mb->i_sub_partition[i_part] ) )
+            {
+                *x += i_sub;
+            }
+            else if( IS_SUB8x4( mb->i_sub_partition[i_part] ) )
+            {
+                *y += i_sub;
+            }
+            break;
+
+        default:
+            break;
+    }
+}
+/* Compute the size (*w, *h), in units of 4x4 blocks, of partition i_part.  For a D_8x8
+ * macroblock the size is that of the sub-partitions of quadrant i_part (i_sub is not
+ * read).  Nothing is written when mb->i_partition is not one of D_16x16, D_16x8,
+ * D_8x16 or D_8x8. */
+static inline void x264_macroblock_partition_size( x264_macroblock_t *mb, int i_part, int i_sub, int *w, int *h )
+{
+    switch( mb->i_partition )
+    {
+        case D_16x16:
+            *w = 4;
+            *h = 4;
+            break;
+
+        case D_16x8:
+            *w = 4;
+            *h = 2;
+            break;
+
+        case D_8x16:
+            *w = 2;
+            *h = 4;
+            break;
+
+        case D_8x8:
+            if( IS_SUB4x4( mb->i_sub_partition[i_part] ) )
+            {
+                *w = 1;
+                *h = 1;
+            }
+            else if( IS_SUB4x8( mb->i_sub_partition[i_part] ) )
+            {
+                *w = 1;
+                *h = 2;
+            }
+            else if( IS_SUB8x4( mb->i_sub_partition[i_part] ) )
+            {
+                *w = 2;
+                *h = 1;
+            }
+            else
+            {
+                /* any other sub-partition type covers the whole 8x8 quadrant */
+                *w = 2;
+                *h = 2;
+            }
+            break;
+
+        default:
+            break;
+    }
+}
+
+/* Store reference index i_ref and motion vector (mx, my) for list i_list in every
+ * cell of mb->partition[][] covered by partition i_part / sub-partition i_sub. */
+void x264_macroblock_partition_set( x264_macroblock_t *mb, int i_list, int i_part, int i_sub, int i_ref, int mx, int my )
+{
+    int x0, y0;
+    int i_w, i_h;
+    int i, j;
+
+    x264_macroblock_partition_getxy( mb, i_part, i_sub, &x0, &y0 );
+    x264_macroblock_partition_size ( mb, i_part, i_sub, &i_w, &i_h );
+
+    for( i = x0; i < x0 + i_w; i++ )
+    {
+        for( j = y0; j < y0 + i_h; j++ )
+        {
+            mb->partition[i][j].i_ref[i_list] = i_ref;
+            mb->partition[i][j].mv[i_list][0] = mx;
+            mb->partition[i][j].mv[i_list][1] = my;
+        }
+    }
+}
+
+/* Read back the reference index and motion vector stored for list i_list in the
+ * top-left cell of partition i_part / sub-partition i_sub.  pi_ref may be NULL; the
+ * vector is written only when both pi_mx and pi_my are non-NULL. */
+void x264_macroblock_partition_get( x264_macroblock_t *mb, int i_list, int i_part, int i_sub, int *pi_ref, int *pi_mx, int *pi_my )
+{
+    int x,y;
+
+    x264_macroblock_partition_getxy( mb, i_part, i_sub, &x, &y );
+
+    if( pi_ref )
+    {
+        *pi_ref = mb->partition[x][y].i_ref[i_list];
+    }
+
+    if( !pi_mx || !pi_my )
+    {
+        return;
+    }
+    *pi_mx = mb->partition[x][y].mv[i_list][0];
+    *pi_my = mb->partition[x][y].mv[i_list][1];
+}
+
+/* Predict the motion vector of partition i_part / sub-partition i_subpart for list
+ * i_list and store it in (*mvxp, *mvyp).
+ *
+ * Three neighbouring cells are examined: A (left), B (top) and C (top-right, replaced
+ * by the top-left cell when the top-right one cannot be used).  Cell positions are in
+ * units of 4x4 blocks; a position outside this macroblock is looked up in mb->mba,
+ * mb->mbb or mb->mbc.
+ *
+ * Original author's remark: not pretty, and not optimised for the common case. */
+void x264_macroblock_predict_mv( x264_macroblock_t *mb, int i_list, int i_part, int i_subpart, int *mvxp, int *mvyp )
+{
+    int x, y, xn, yn;
+    int w, h;
+    int i_ref;
+
+    /* reference of each neighbour: -1 = no neighbour, -2 = neighbour is intra */
+    int i_refa = -1;
+    int i_refb = -1;
+    int i_refc = -1;
+
+    /* neighbour vectors stay (0,0) unless loaded from an inter neighbour */
+    int mvxa = 0, mvxb = 0, mvxc = 0;
+    int mvya = 0, mvyb = 0, mvyc = 0;
+
+    x264_macroblock_t *mbn;
+
+
+    x264_macroblock_partition_getxy( mb, i_part, i_subpart, &x, &y );
+    x264_macroblock_partition_size( mb, i_part, i_subpart, &w, &h );
+    i_ref = mb->partition[x][y].i_ref[i_list];
+
+    /* A: cell to the left (x-1, y); wraps into mb->mba at the left edge */
+    xn = x - 1;
+    mbn = mb;
+    if( xn < 0 )
+    {
+        xn += 4;
+        mbn = mb->mba;
+    }
+    if( mbn )
+    {
+        i_refa = -2;
+        if( !IS_INTRA( mbn->i_type ) )
+        {
+            i_refa = mbn->partition[xn][y].i_ref[i_list];
+            mvxa   = mbn->partition[xn][y].mv[i_list][0];
+            mvya   = mbn->partition[xn][y].mv[i_list][1];
+        }
+    }
+
+    /* B: cell above (x, y-1); wraps into mb->mbb at the top edge */
+    yn = y - 1;
+    mbn = mb;
+    if( yn < 0 )
+    {
+        yn += 4;
+        mbn = mb->mbb;
+    }
+    if( mbn )
+    {
+        i_refb = -2;
+        if( !IS_INTRA( mbn->i_type ) )
+        {
+            i_refb = mbn->partition[x][yn].i_ref[i_list];
+            mvxb   = mbn->partition[x][yn].mv[i_list][0];
+            mvyb   = mbn->partition[x][yn].mv[i_list][1];
+        }
+    }
+
+    /* C: cell above-right (x+w, y-1) */
+    xn = x + w;
+    yn = y - 1;
+
+    mbn = mb;
+    if( yn < 0 && xn >= 4 )
+    {
+        /* outside on both axes: only mb->mbc can provide it */
+        if( mb->mbc )
+        {
+            xn -= 4;
+            yn += 4;
+            mbn = mb->mbc;
+        }
+        else
+        {
+            mbn = NULL;
+        }
+    }
+    else if( yn < 0 )
+    {
+        yn += 4;
+        mbn = mb->mbb;
+    }
+    else if( xn >= 4 || ( xn == 2 && ( yn == 0 || yn == 2 ) ) )
+    {
+        mbn = NULL; /* not yet decoded */
+    }
+
+    if( mbn == NULL )
+    {
+        /* C cannot be used: take the top-left cell (x-1, y-1) instead */
+        xn = x - 1;
+        yn = y - 1;
+
+        mbn = mb;
+        if( yn < 0 && xn < 0 )
+        {
+            if( mb->mba && mb->mbb )
+            {
+                xn += 4;
+                yn += 4;
+                /* the macroblock stored just before the top one, used as the top-left neighbour */
+                mbn = mb->mbb - 1;
+            }
+            else
+            {
+                mbn = NULL;
+            }
+        }
+        else if( yn < 0 )
+        {
+            yn += 4;
+            mbn = mb->mbb;
+        }
+        else if( xn < 0 )
+        {
+            xn += 4;
+            mbn = mb->mba;
+        }
+    }
+
+    if( mbn )
+    {
+        i_refc = -2;
+        if( !IS_INTRA( mbn->i_type ) )
+        {
+            i_refc = mbn->partition[xn][yn].i_ref[i_list];
+            mvxc   = mbn->partition[xn][yn].mv[i_list][0];
+            mvyc   = mbn->partition[xn][yn].mv[i_list][1];
+        }
+    }
+
+    /* 16x8 and 8x16 partitions first try one preferred neighbour: it is used directly
+     * when it has the same reference as the current partition */
+    if( mb->i_partition == D_16x8 && i_part == 0 && i_refb == i_ref )
+    {
+        *mvxp = mvxb;
+        *mvyp = mvyb;
+    }
+    else if( mb->i_partition == D_16x8 && i_part == 1 && i_refa == i_ref )
+    {
+        *mvxp = mvxa;
+        *mvyp = mvya;
+    }
+    else if( mb->i_partition == D_8x16 && i_part == 0 && i_refa == i_ref )
+    {
+        *mvxp = mvxa;
+        *mvyp = mvya;
+    }
+    else if( mb->i_partition == D_8x16 && i_part == 1 && i_refc == i_ref )
+    {
+        *mvxp = mvxc;
+        *mvyp = mvyc;
+    }
+    else
+    {
+        int i_count;
+
+        /* number of neighbours that use the same reference as the current partition */
+        i_count = 0;
+        if( i_refa == i_ref ) i_count++;
+        if( i_refb == i_ref ) i_count++;
+        if( i_refc == i_ref ) i_count++;
+
+        if( i_count > 1 )
+        {
+            *mvxp = x264_median( mvxa, mvxb, mvxc );
+            *mvyp = x264_median( mvya, mvyb, mvyc );
+        }
+        else if( i_count == 1 )
+        {
+            /* exactly one match: copy that neighbour's vector */
+            if( i_refa == i_ref )
+            {
+                *mvxp = mvxa;
+                *mvyp = mvya;
+            }
+            else if( i_refb == i_ref )
+            {
+                *mvxp = mvxb;
+                *mvyp = mvyb;
+            }
+            else
+            {
+                *mvxp = mvxc;
+                *mvyp = mvyc;
+            }
+        }
+        else if( i_refb == -1 && i_refc == -1 && i_refa != -1 )
+        {
+            /* no match, and only the left neighbour exists: use it */
+            *mvxp = mvxa;
+            *mvyp = mvya;
+        }
+        else
+        {
+            /* no match: median of the three vectors (missing or intra ones count as 0) */
+            *mvxp = x264_median( mvxa, mvxb, mvxc );
+            *mvyp = x264_median( mvya, mvyb, mvyc );
+        }
+    }
+}
+
+/* Motion vector predictor of a P-skip macroblock (list 0 only).
+ * The predictor is (0,0) when the left or the top neighbour is missing, or when one of
+ * them uses reference 0 with a zero vector; otherwise it is the prediction returned by
+ * x264_macroblock_predict_mv for partition 0 of list 0. */
+void x264_macroblock_predict_mv_pskip( x264_macroblock_t *mb, int *mvxp, int *mvyp )
+{
+    x264_macroblock_t *p_left;
+    x264_macroblock_t *p_top;
+    int x, y;
+    int i_col, i_row;
+
+    /* -1: no neighbour, -2: intra neighbour, otherwise its list-0 reference */
+    int i_ref_left = -1;
+    int i_ref_top  = -1;
+    int mv_left[2] = { 0, 0 };
+    int mv_top[2]  = { 0, 0 };
+
+    x264_macroblock_partition_getxy( mb, 0, 0, &x, &y );
+
+    /* left neighbour (x-1, y), taken from mb->mba when outside this macroblock */
+    i_col  = x - 1;
+    p_left = mb;
+    if( i_col < 0 )
+    {
+        i_col += 4;
+        p_left = mb->mba;
+    }
+    if( p_left )
+    {
+        i_ref_left = -2;
+        if( !IS_INTRA( p_left->i_type ) )
+        {
+            i_ref_left = p_left->partition[i_col][y].i_ref[0];
+            mv_left[0] = p_left->partition[i_col][y].mv[0][0];
+            mv_left[1] = p_left->partition[i_col][y].mv[0][1];
+        }
+    }
+
+    /* top neighbour (x, y-1), taken from mb->mbb when outside this macroblock */
+    i_row = y - 1;
+    p_top = mb;
+    if( i_row < 0 )
+    {
+        i_row += 4;
+        p_top = mb->mbb;
+    }
+    if( p_top )
+    {
+        i_ref_top = -2;
+        if( !IS_INTRA( p_top->i_type ) )
+        {
+            i_ref_top = p_top->partition[x][i_row].i_ref[0];
+            mv_top[0] = p_top->partition[x][i_row].mv[0][0];
+            mv_top[1] = p_top->partition[x][i_row].mv[0][1];
+        }
+    }
+
+    if( i_ref_left != -1 && i_ref_top != -1 &&
+        !( i_ref_left == 0 && mv_left[0] == 0 && mv_left[1] == 0 ) &&
+        !( i_ref_top  == 0 && mv_top[0]  == 0 && mv_top[1]  == 0 ) )
+    {
+        x264_macroblock_predict_mv( mb, 0, 0, 0, mvxp, mvyp );
+        return;
+    }
+
+    *mvxp = 0;
+    *mvyp = 0;
+}
+
+/* 52-entry lookup table: identity for indices 0..29, then growing more slowly up to 39.
+ * NOTE(review): presumably maps a luma QP to the chroma QP - confirm against its users. */
+static const int i_chroma_qp_table[52] =
+{
+     0,  1,  2,  3,  4,  5,  6,  7,  8,  9,
+    10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+    20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+    29, 30, 31, 32, 32, 33, 34, 34, 35, 35,
+    36, 36, 37, 37, 37, 38, 38, 38, 39, 39,
+    39, 39
+};
+
+/* Motion-compensate one partition (i_part) / sub-partition (i_sub) of mb from its
+ * list-0 reference into ctx->p_fdec: the luma plane when b_luma is set, otherwise the
+ * two chroma planes.  Positions and sizes are in 4x4-block units, i.e. 4 pixels in
+ * luma and 2 pixels in chroma. */
+static void x264_macroblock_mc_part( x264_t *h, x264_macroblock_t *mb, int i_part, int i_sub, int b_luma )
+{
+    x264_mb_context_t *ctx = mb->context;
+
+    int i_ref;
+    int mx, my;
+    int x, y;
+    int i_width, i_height;
+
+    x264_macroblock_partition_get(   mb, 0, i_part, i_sub, &i_ref, &mx, &my );
+    x264_macroblock_partition_getxy( mb, i_part, i_sub, &x, &y );
+    x264_macroblock_partition_size(  mb, i_part, i_sub, &i_width, &i_height );
+
+    if( b_luma )
+    {
+        int     i_src = ctx->i_fref0[i_ref][0];
+        uint8_t *p_src= ctx->p_fref0[i_ref][0];
+        int     i_dst = ctx->i_fdec[0];
+        uint8_t *p_dst= ctx->p_fdec[0];
+
+        h->mc[MC_LUMA]( &p_src[4*(x+y*i_src)], i_src,
+                        &p_dst[4*(x+y*i_dst)], i_dst,
+                        mx, my, 4*i_width, 4*i_height );
+    }
+    else
+    {
+        int i_plane;
+
+        for( i_plane = 1; i_plane <= 2; i_plane++ )
+        {
+            int     i_src = ctx->i_fref0[i_ref][i_plane];
+            uint8_t *p_src= ctx->p_fref0[i_ref][i_plane];
+            int     i_dst = ctx->i_fdec[i_plane];
+            uint8_t *p_dst= ctx->p_fdec[i_plane];
+
+            h->mc[MC_CHROMA]( &p_src[2*(x+y*i_src)], i_src,
+                              &p_dst[2*(x+y*i_dst)], i_dst,
+                              mx, my, 2*i_width, 2*i_height );
+        }
+    }
+}
+
+/* Motion-compensate a whole macroblock (luma if b_luma, else chroma).
+ * P_L0 macroblocks are processed partition by partition, P_8x8 macroblocks
+ * sub-partition by sub-partition; any other macroblock type is left untouched. */
+static void x264_macroblock_mc( x264_t *h, x264_macroblock_t *mb, int b_luma )
+{
+    int i_part;
+    int i_sub;
+
+    if( mb->i_type == P_L0 )
+    {
+        for( i_part = 0; i_part < mb_partition_count( mb->i_partition ); i_part++ )
+        {
+            x264_macroblock_mc_part( h, mb, i_part, 0, b_luma );
+        }
+    }
+    else if( mb->i_type == P_8x8 )
+    {
+        for( i_part = 0; i_part < 4; i_part++ )
+        {
+            for( i_sub = 0; i_sub < mb_sub_partition_count( mb->i_sub_partition[i_part] ); i_sub++ )
+            {
+                x264_macroblock_mc_part( h, mb, i_part, i_sub, b_luma );
+            }
+        }
+    }
+}
+
+/*****************************************************************************
+ * x264_macroblock_context_load:
+ *  fill `context` for macroblock `mb` and attach it to mb->context.
+ *  - plane pointers and strides of the source picture (img), of the frame
+ *    being reconstructed (fdec) and of every list-0 / list-1 reference,
+ *    all positioned on this macroblock;
+ *  - for each of the 16 luma 4x4 blocks and of the 2x4 chroma blocks, the
+ *    left (a) and top (b) neighbouring macroblock and block, or NULL when
+ *    that neighbour is not flagged in mb->i_neighbour.
+ *****************************************************************************/
+void x264_macroblock_context_load( x264_t *h, x264_macroblock_t *mb, x264_mb_context_t *context )
+{
+    int i;
+    int x, y;
+    x264_macroblock_t *a = NULL;
+    x264_macroblock_t *b = NULL;
+
+    /* left neighbour is the previous array element, top neighbour is one macroblock row earlier */
+    if( mb->i_neighbour&MB_LEFT )
+    {
+        a = mb - 1;
+    }
+    if( mb->i_neighbour&MB_TOP )
+    {
+        b = mb - h->sps.i_mb_width;
+    }
+/* LOAD_PTR: point context->p_<dst>[0..2] at this macroblock inside the three planes of
+ * src (16-pixel steps for plane 0, 8-pixel steps for planes 1 and 2) and copy the strides */
+#define LOAD_PTR( dst, src ) \
+    context->p_##dst[0] = (src)->plane[0] + 16 * ( mb->i_mb_x + mb->i_mb_y * (src)->i_stride[0] ); \
+    context->p_##dst[1] = (src)->plane[1] +  8 * ( mb->i_mb_x + mb->i_mb_y * (src)->i_stride[1] ); \
+    context->p_##dst[2] = (src)->plane[2] +  8 * ( mb->i_mb_x + mb->i_mb_y * (src)->i_stride[2] ); \
+    context->i_##dst[0] = (src)->i_stride[0]; \
+    context->i_##dst[1] = (src)->i_stride[1]; \
+    context->i_##dst[2] = (src)->i_stride[2]
+
+    LOAD_PTR( img,  h->picture );
+    LOAD_PTR( fdec, h->fdec );
+    for( i = 0; i < h->i_ref0; i++ )
+    {
+        LOAD_PTR( fref0[i], h->fref0[i] );
+    }
+    for( i = 0; i < h->i_ref1; i++ )
+    {
+        LOAD_PTR( fref1[i], h->fref1[i] );
+    }
+#undef LOAD_PTR
+
+    /* neighbour links of every 4x4 block; (x, y) is the block position in the macroblock */
+    for( y = 0; y < 4; y++ )
+    {
+        for( x = 0; x < 4; x++ )
+        {
+            int idx;
+            int xa, yb;
+            x264_macroblock_t *mba;
+            x264_macroblock_t *mbb;
+
+            idx = block_idx_xy[x][y];
+            mba = mb;
+            mbb = mb;
+
+            /* left: wraps to the last column of macroblock a */
+            xa = x - 1;
+            if (xa < 0 )
+            {
+                xa += 4;
+                mba = a;
+            }
+            /* up: wraps to the last row of macroblock b */
+            yb = y - 1;
+            if (yb < 0 )
+            {
+                yb += 4;
+                mbb = b;
+            }
+
+            context->block[idx].mba = mba;
+            context->block[idx].mbb = mbb;
+            context->block[idx].bka = mba ? &mba->block[block_idx_xy[xa][y]] : NULL;
+            context->block[idx].bkb = mbb ? &mbb->block[block_idx_xy[x][yb]] : NULL;
+
+            /* the top-left 2x2 positions also describe the chroma blocks
+             * (entries 16..19 and 20..23 of block[]) */
+            if( x < 2 && y < 2 )
+            {
+                int ch;
+                if( xa > 1 ) xa -= 2;   /* we have wrap but here step is 2 not 4 */
+                if( yb > 1 ) yb -= 2;   /* idem */
+
+                for( ch = 0; ch < 2; ch++ )
+                {
+                    context->block[16+4*ch+idx].mba = mba;
+                    context->block[16+4*ch+idx].mbb = mbb;
+                    context->block[16+4*ch+idx].bka = mba ? &mba->block[16+4*ch+block_idx_xy[xa][y]] : NULL;
+                    context->block[16+4*ch+idx].bkb = mbb ? &mbb->block[16+4*ch+block_idx_xy[x][yb]] : NULL;
+                }
+            }
+        }
+    }
+
+    mb->context = context;
+}
+
+/* (ref: JVT-B118)
+ * x264_mb_decimate_score: score the i_max scanned coefficients in dct to decide
+ * whether they are worth coding (a low score means they can be set to zero).
+ * Any coefficient with a magnitude above 1 gives 9 straight away; otherwise every
+ * coefficient of magnitude 1 adds a weight that depends on the number of zeros
+ * between it and the previous non-zero coefficient (or the start of the array).
+ * Used in inter macroblock (luma and chroma)
+ *  luma: for a 8x8 block: if score < 4 -> null
+ *        for the complete mb: if score < 6 -> null
+ *  chroma: for the complete mb: if score < 7 -> null
+ */
+static int x264_mb_decimate_score( int *dct, int i_max )
+{
+    static const int i_ds_table[16] = { 3, 2, 2, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+
+    int i_score = 0;
+    int i_zeros = 0;    /* zeros seen since the last non-zero coefficient */
+    int i;
+
+    for( i = 0; i < i_max; i++ )
+    {
+        if( dct[i] == 0 )
+        {
+            i_zeros++;
+            continue;
+        }
+        if( abs( dct[i] ) > 1 )
+        {
+            return 9;
+        }
+        i_score += i_ds_table[i_zeros];
+        i_zeros = 0;
+    }
+
+    /* zeros after the last non-zero coefficient do not count */
+    return i_score;
+}
+
+/* Encode and reconstruct luma 4x4 block idx of the macroblock.
+ * The difference between the source picture and ctx->p_fdec is transformed, quantized
+ * with i_qscale (intra rounding, b_intra = 1) and zigzag-scanned into
+ * mb->block[idx].luma4x4; the block is then dequantized, inverse transformed and added
+ * back to ctx->p_fdec. */
+static void x264_mb_encode_4x4( x264_t *h, x264_macroblock_t *mb, int idx, int i_qscale )
+{
+    x264_mb_context_t *ctx = mb->context;
+
+    int      i_src = ctx->i_img[0];
+    int      i_dst = ctx->i_fdec[0];
+    /* top-left pixel of the block in the source picture and in fdec */
+    uint8_t *p_src = ctx->p_img[0]  + 4 * ( block_idx_x[idx] + block_idx_y[idx] * i_src );
+    uint8_t *p_dst = ctx->p_fdec[0] + 4 * ( block_idx_x[idx] + block_idx_y[idx] * i_dst );
+
+    int16_t residual[4][4];
+    int16_t coef[4][4];
+
+    /* forward path: difference -> dct -> quant -> scan */
+    h->pixf.sub4x4( residual, p_src, i_src, p_dst, i_dst );
+    h->dctf.dct4x4( coef, residual );
+    quant_4x4( coef, i_qscale, 1 );
+    scan_zigzag_4x4full( mb->block[idx].luma4x4, coef );
+
+    /* reconstruction path: dequant -> idct -> add to fdec */
+    dequant_4x4( coef, i_qscale );
+    h->dctf.idct4x4( residual, coef );
+    h->pixf.add4x4( p_dst, i_dst, residual );
+}
+
+/* Encode and reconstruct the luma of an intra 16x16 macroblock.
+ * Each of the 16 blocks is transformed; its DC term is collected into a separate 4x4
+ * DC block, the rest is quantized and scanned into mb->block[n].residual_ac.  The DC
+ * block gets its own transform and quantization and is scanned into mb->luma16x16_dc.
+ * Everything is then dequantized, inverse transformed and added back to ctx->p_fdec. */
+static void x264_mb_encode_i16x16( x264_t *h, x264_macroblock_t *mb, int i_qscale )
+{
+    x264_mb_context_t *ctx = mb->context;
+
+    uint8_t *p_src = ctx->p_img[0];
+    int      i_src = ctx->i_img[0];
+    uint8_t *p_dst = ctx->p_fdec[0];
+    int      i_dst = ctx->i_fdec[0];
+
+    int16_t residual[16][4][4];
+    int16_t dc[4][4];
+    int16_t coef[16][4][4];
+
+    int n;
+
+    /* difference between the source and fdec */
+    h->pixf.sub16x16( residual, p_src, i_src, p_dst, i_dst );
+
+    /* forward transform of every 4x4 block */
+    for( n = 0; n < 16; n++ )
+    {
+        h->dctf.dct4x4( coef[n], residual[n] );
+
+        /* keep the unquantized DC term aside */
+        dc[block_idx_y[n]][block_idx_x[n]] = coef[n][0][0];
+
+        quant_4x4( coef[n], i_qscale, 1 );
+        scan_zigzag_4x4( mb->block[n].residual_ac, coef[n] );
+    }
+
+    /* DC block: transform, quantize, scan */
+    h->dctf.dct4x4dc( dc, dc );
+    quant_4x4_dc( dc, i_qscale, 1 );
+    scan_zigzag_4x4full( mb->luma16x16_dc, dc );
+
+    /* reconstruction of the DC block */
+    h->dctf.idct4x4dc( dc, dc );
+    dequant_4x4_dc( dc, i_qscale );  /* XXX not inversed */
+
+    /* reconstruction of every 4x4 block */
+    for( n = 0; n < 16; n++ )
+    {
+        dequant_4x4( coef[n], i_qscale );
+
+        /* put the reconstructed DC term back */
+        coef[n][0][0] = dc[block_idx_y[n]][block_idx_x[n]];
+
+        h->dctf.idct4x4( residual[n], coef[n] );
+    }
+
+    /* put pixels to fdec */
+    h->pixf.add16x16( p_dst, i_dst, residual );
+}
+
+/* Encode and reconstruct both chroma planes of the macroblock.
+ *
+ * For each plane the difference between the source and ctx->p_fdec is transformed with
+ * four 4x4 DCTs; the four DC terms go through the 2x2 DC transform.  Coefficients are
+ * quantized with i_qscale and scanned into mb->block[16 + 4*ch + i].residual_ac and
+ * mb->chroma_dc[ch].  When b_inter is set and the decimate score of the plane is below
+ * 7, its AC coefficients are dropped.  The plane is finally dequantized, inverse
+ * transformed and added back to ctx->p_fdec.
+ *
+ * The decimate score is reset for every plane, so that each plane is judged on its own
+ * coefficients only. */
+static void x264_mb_encode_8x8( x264_t *h, x264_macroblock_t *mb, int b_inter, int i_qscale )
+{
+    x264_mb_context_t *ctx = mb->context;
+
+    uint8_t *p_src, *p_dst;
+    int      i_src, i_dst;
+
+    int i, ch;
+
+    for( ch = 0; ch < 2; ch++ )
+    {
+        int16_t chroma[4][4][4];
+        int16_t dct2x2[2][2];
+        int16_t dct4x4[4][4][4];
+
+        /* score of this plane only */
+        int i_decimate_score = 0;
+
+        p_src = ctx->p_img[1+ch];
+        i_src = ctx->i_img[1+ch];
+        p_dst = ctx->p_fdec[1+ch];
+        i_dst = ctx->i_fdec[1+ch];
+
+        /* calculate the diff */
+        h->pixf.sub8x8( chroma, p_src, i_src, p_dst, i_dst );
+
+        /* calculate dct coeffs */
+        for( i = 0; i < 4; i++ )
+        {
+            h->dctf.dct4x4( dct4x4[i], chroma[i] );
+
+            /* copy dc coeff */
+            dct2x2[block_idx_y[i]][block_idx_x[i]] = dct4x4[i][0][0];
+
+            quant_4x4( dct4x4[i], i_qscale, 1 );
+            scan_zigzag_4x4( mb->block[16+i+ch*4].residual_ac, dct4x4[i] );
+
+            i_decimate_score += x264_mb_decimate_score( mb->block[16+i+ch*4].residual_ac, 15 );
+        }
+
+        h->dctf.dct2x2dc( dct2x2, dct2x2 );
+        quant_2x2_dc( dct2x2, i_qscale, 1 );
+        scan_zigzag_2x2_dc( mb->chroma_dc[ch], dct2x2 );
+
+        if( i_decimate_score < 7 && b_inter )
+        {
+            /* Near null chroma 8x8 block so make it null (bits saving) */
+            for( i = 0; i < 4; i++ )
+            {
+                int x, y;
+                for( x = 0; x < 15; x++ )
+                {
+                    mb->block[16+i+ch*4].residual_ac[x] = 0;
+                }
+                for( x = 0; x < 4; x++ )
+                {
+                    for( y = 0; y < 4; y++ )
+                    {
+                        dct4x4[i][x][y] = 0;
+                    }
+                }
+            }
+        }
+
+        /* output samples to fdec */
+        h->dctf.idct2x2dc( dct2x2, dct2x2 );
+        dequant_2x2_dc( dct2x2, i_qscale );  /* XXX not inversed */
+
+        /* dequantize and inverse transform every 4x4 block */
+        for( i = 0; i < 4; i++ )
+        {
+            dequant_4x4( dct4x4[i], i_qscale );
+
+            /* copy dc coeff */
+            dct4x4[i][0][0] = dct2x2[block_idx_y[i]][block_idx_x[i]];
+
+            h->dctf.idct4x4( chroma[i], dct4x4[i] );
+        }
+        h->pixf.add8x8( p_dst, i_dst, chroma );
+    }
+}
+
+/* Map the DC variants of the 4x4 modes (DC_LEFT, DC_TOP, DC_128) to plain
+ * I_PRED_4x4_DC; every other mode is returned unchanged. */
+static int x264_mb_pred_mode4x4_fix( int i_mode )
+{
+    const int b_dc_variant = i_mode == I_PRED_4x4_DC_LEFT ||
+                             i_mode == I_PRED_4x4_DC_TOP  ||
+                             i_mode == I_PRED_4x4_DC_128;
+
+    return b_dc_variant ? I_PRED_4x4_DC : i_mode;
+}
+/* Map the DC variants of the 16x16 modes (DC_LEFT, DC_TOP, DC_128) to plain
+ * I_PRED_16x16_DC; every other mode is returned unchanged. */
+static int x264_mb_pred_mode16x16_fix( int i_mode )
+{
+    const int b_dc_variant = i_mode == I_PRED_16x16_DC_LEFT ||
+                             i_mode == I_PRED_16x16_DC_TOP  ||
+                             i_mode == I_PRED_16x16_DC_128;
+
+    return b_dc_variant ? I_PRED_16x16_DC : i_mode;
+}
+/* Map the DC variants of the chroma modes (DC_LEFT, DC_TOP, DC_128) to plain
+ * I_PRED_CHROMA_DC; every other mode is returned unchanged. */
+static int x264_mb_pred_mode8x8_fix( int i_mode )
+{
+    const int b_dc_variant = i_mode == I_PRED_CHROMA_DC_LEFT ||
+                             i_mode == I_PRED_CHROMA_DC_TOP  ||
+                             i_mode == I_PRED_CHROMA_DC_128;
+
+    return b_dc_variant ? I_PRED_CHROMA_DC : i_mode;
+}
+
+/* Working data and results of the analysis of one macroblock. */
+typedef struct
+{
+    /* the analysis is conducted with this lambda and QP */
+    int i_lambda;
+    int i_qp;
+
+    /* Edge histograms (luma only), filled by x264_macroblock_analyse_edge:
+     * gradient magnitude accumulated per prediction direction */
+    int i_edge_4x4[4][4][9];    /* [block row][block column][4x4 mode]; mode 2 isn't calculated (DC) */
+    int i_edge_16x16[4];        /* [16x16 mode]; mode 2 isn't calculated (DC) */
+
+    /* I: Intra part */
+    /* Luma part: best cost and chosen mode(s) for 16x16 and 4x4 prediction */
+    int i_sad_i16x16;
+    int i_predict16x16;
+
+    int i_sad_i4x4;
+    int i_predict4x4[4][4];
+
+    /* Chroma part */
+    int i_sad_i8x8;
+    int i_predict8x8;
+
+    /* II: Inter part: cost, reference and motion vector(s) per partition type */
+    int i_sad_p16x16;
+    int i_ref_p16x16;
+    int i_mv_p16x16[2];
+
+    int i_sad_p16x8;
+    int i_ref_p16x8;
+    int i_mv_p16x8[2][2];
+
+    int i_sad_p8x16;
+    int i_ref_p8x16;
+    int i_mv_p8x16[2][2];
+
+    int i_sad_p8x8;
+    int i_ref_p8x8;
+    int i_sub_partition_p8x8[4];
+    int i_mv_p8x8[4][4][2];
+
+} x264_mb_analysis_t;
+
+
+/* 52-entry table of non-decreasing weights, from 1 up to 91.
+ * NOTE(review): presumably indexed by QP and used as the lambda that weights bit
+ * costs (cf. i_lambda in x264_mb_analysis_t) - confirm at the point of use. */
+static const int i_qp0_cost_table[52] =
+{
+   1, 1, 1, 1, 1, 1, 1, 1,
+   1, 1, 1, 1,
+   1, 1, 1, 1, 2, 2, 2, 2,
+   3, 3, 3, 4, 4, 4, 5, 6,
+   6, 7, 8, 9,10,11,13,14,
+  16,18,20,23,25,29,32,36,
+  40,45,51,57,64,72,81,91
+};
+
+
+/* Build gradient-direction histograms of the source luma macroblock.
+ *
+ * For every pixel considered, two 3x3 gradients (weights 1,2,1) are computed; their
+ * ratio selects a 4x4 prediction mode and a 16x16 prediction mode, and the magnitude
+ * |dgx| + |dgy| is added to res->i_edge_4x4[y][x][mode] (per 4x4 block) and to
+ * res->i_edge_16x16[mode] (whole macroblock).  The outermost ring of pixels of the
+ * macroblock is skipped, so the 3x3 window never leaves the 16x16 area.
+ * h is not used. */
+static void x264_macroblock_analyse_edge( x264_t *h, x264_macroblock_t *mb, x264_mb_analysis_t *res )
+{
+    uint8_t *p_img = mb->context->p_img[0];
+    int      i_img = mb->context->i_img[0];
+
+    int dx, dy;
+    int x,  y;
+    int i;
+
+/* convert a real ratio to 8-bit fixed point */
+#define FIX8( f ) ( (int)((f) * 256))
+    /* init stats (16x16) */
+    for( i = 0; i < 4; i++ )
+    {
+        res->i_edge_16x16[i] = 0;
+    }
+
+    for( y = 0; y < 4; y++ )
+    {
+        for( x = 0; x < 4; x++ )
+        {
+            /* init stats (4x4) */
+            for( i = 0; i < 9; i++ )
+            {
+                res->i_edge_4x4[y][x][i] = 0;
+            }
+
+            /* FIXME real interval 0-4 except for border mb */
+            for( dy = (y==0 ? 1:0); dy < (y==3?3:4); dy++ )
+            {
+                for( dx = (x==0?1:0); dx < (x==3?3:4); dx++ )
+                {
+                    /* pixel (dx, dy) of 4x4 block (x, y): both block coordinates are
+                     * scaled by 4 to get the position inside the macroblock */
+                    uint8_t *pix = &p_img[(y*4+dy)*i_img+(x*4+dx)];
+                    int dgx, dgy;
+                    int Ryx;
+                    int Ag;
+                    int Dg;
+
+                    /* row above minus row below */
+                    dgx = (pix[-1*i_img-1]+2*pix[-1*i_img+0]+pix[-1*i_img+1]) -
+                          (pix[ 1*i_img-1]+2*pix[ 1*i_img+0]+pix[ 1*i_img+1]);
+
+                    /* column to the right minus column to the left */
+                    dgy = (pix[-1*i_img+1]+2*pix[ 0*i_img+1]+pix[ 1*i_img+1]) -
+                          (pix[-1*i_img-1]+2*pix[ 0*i_img-1]+pix[ 1*i_img-1]);
+
+                    /* XXX angle to test/verify */
+                    Ag = abs( dgx ) + abs( dgy );
+
+                    /* ratio dgy / dgx in 8-bit fixed point; a large value stands for
+                     * infinity when dgx is 0.  dgy may be negative, hence the
+                     * multiplication instead of a left shift. */
+                    if( dgx == 0 )
+                    {
+                        Ryx = (4*256)<<8;
+                    }
+                    else
+                    {
+                        Ryx = ( dgy * 256 ) / dgx;
+                    }
+
+                    if( abs(Ryx) >= FIX8(5.027339) )
+                    {
+                        Dg = I_PRED_4x4_V;
+                    }
+                    else if( abs(Ryx) <= FIX8(0.198912) )
+                    {
+                        Dg = I_PRED_4x4_H;
+                    }
+                    else if( Ryx > FIX8(0.198912) && Ryx <= FIX8(0.668179) )
+                    {
+                        Dg = I_PRED_4x4_HD;
+                    }
+                    else if( Ryx > FIX8(0.668179) && Ryx <= FIX8(1.496606) )
+                    {
+                        Dg = I_PRED_4x4_DDR;
+                    }
+                    else if( Ryx > FIX8(1.496606) && Ryx <= FIX8(5.027339) )
+                    {
+                        Dg = I_PRED_4x4_VR;
+                    }
+                    else if( Ryx > FIX8(-5.027339) && Ryx <= FIX8(-1.496606) )
+                    {
+                        Dg = I_PRED_4x4_VL;
+                    }
+                    else if( Ryx > FIX8(-1.496606) && Ryx <= FIX8(-0.668179) )
+                    {
+                        Dg = I_PRED_4x4_DDL;
+                    }
+                    else if( Ryx > FIX8(-0.668179) && Ryx <= FIX8(-0.198912) )
+                    {
+                        Dg = I_PRED_4x4_HU;
+                    }
+                    else
+                    {
+                        /* Should never occur */
+                        fprintf( stderr, "mmh bad edge dectection function\n" );
+                        Dg = I_PRED_4x4_DC;
+                    }
+                    res->i_edge_4x4[y][x][Dg] += Ag;
+
+                    /* coarser classification for the 16x16 modes */
+                    if( abs(Ryx) > FIX8(2.414214) )
+                    {
+                        Dg = I_PRED_16x16_V;
+                    }
+                    else if( abs(Ryx) < FIX8(0.414214) )
+                    {
+                        Dg = I_PRED_16x16_H;
+                    }
+                    else
+                    {
+                        Dg = I_PRED_16x16_P;
+                    }
+                    res->i_edge_16x16[Dg] += Ag;
+                }
+            }
+        }
+    }
+#undef FIX8
+}
+
+/*
+ * Evaluate every intra 16x16 luma predictor reported as usable for this
+ * macroblock and remember the cheapest one.
+ *
+ * Each candidate is rendered into the reconstruction plane
+ * (mb->context->p_fdec[0]) and scored as
+ *     SATD( prediction, source ) + i_lambda * bs_size_ue( fixed mode ).
+ *
+ * Outputs:
+ *   res->i_predict16x16  winning mode (untouched when no candidate is tested)
+ *   res->i_sad_i16x16    cost of the winner, or -1 when no candidate is tested
+ *
+ * The reconstruction plane is left holding the last candidate tried, which
+ * is not necessarily the winner.
+ */
+static void x264_macroblock_analyse_i16x16( x264_t *h, x264_macroblock_t *mb, x264_mb_analysis_t *res )
+{
+    uint8_t * const p_rec = mb->context->p_fdec[0];
+    uint8_t * const p_org = mb->context->p_img[0];
+    const int       i_rec = mb->context->i_fdec[0];
+    const int       i_org = mb->context->i_img[0];
+
+    int candidates[4];
+    int i_candidates;
+    int n;
+
+    /* -1 is the "nothing evaluated yet" marker */
+    res->i_sad_i16x16 = -1;
+
+    predict_16x16_mode_available( mb, candidates, &i_candidates );
+    for( n = 0; n < i_candidates; n++ )
+    {
+        const int i_mode = candidates[n];
+        int i_cost;
+
+        /* render the candidate prediction */
+        h->predict_16x16[i_mode]( p_rec, i_rec );
+
+        /* distortion plus the estimated cost of signalling the mode */
+        i_cost  = h->pixf.satd[PIXEL_16x16]( p_rec, i_rec, p_org, i_org );
+        i_cost += res->i_lambda * bs_size_ue( x264_mb_pred_mode16x16_fix( i_mode ) );
+
+        /* keep the current best unless this candidate is strictly cheaper */
+        if( res->i_sad_i16x16 != -1 && res->i_sad_i16x16 <= i_cost )
+        {
+            continue;
+        }
+        res->i_predict16x16 = i_mode;
+        res->i_sad_i16x16   = i_cost;
+    }
+}
+
+/*
+ * Choose an intra 4x4 predictor for each of the 16 luma blocks and
+ * accumulate the total cost in res->i_sad_i4x4.
+ *
+ * For every block the candidate list is pruned with the edge histogram in
+ * res->i_edge_4x4[y][x][]: the threshold is half of the largest bin among
+ * bins 0,1,3..8, bin 2 is then overwritten with that threshold, and a
+ * candidate is skipped when the bin of its fixed mode is below the
+ * threshold.  Surviving candidates are scored as
+ *     SATD + i_lambda * ( 1 if the fixed mode equals the predicted mode, else 4 ).
+ *
+ * Side effects: after each block is decided, its mode is written to
+ * mb->block[idx].i_intra4x4_pred_mode, the block is predicted with that mode
+ * and encoded with x264_mb_encode_4x4() at res->i_qp, because the following
+ * blocks need it.  The chosen modes are stored in res->i_predict4x4[x][y].
+ */
+static void x264_macroblock_analyse_i4x4( x264_t *h, x264_macroblock_t *mb, x264_mb_analysis_t *res )
+{
+    uint8_t * const p_rec = mb->context->p_fdec[0];
+    uint8_t * const p_org = mb->context->p_img[0];
+    const int       i_rec = mb->context->i_fdec[0];
+    const int       i_org = mb->context->i_img[0];
+
+    int candidates[9];
+    int i_candidates;
+    int i_blk;
+
+    res->i_sad_i4x4 = 0;
+
+    for( i_blk = 0; i_blk < 16; i_blk++ )
+    {
+        /* mode predicted for this block; matching it is cheaper to signal */
+        const int i_expected = predict_pred_intra4x4_mode( h, mb, i_blk );
+        const int bx = block_idx_x[i_blk];
+        const int by = block_idx_y[i_blk];
+
+        uint8_t * const p_org_blk = p_org + 4 * bx + 4 * by * i_org;
+        uint8_t * const p_rec_blk = p_rec + 4 * bx + 4 * by * i_rec;
+
+        int i_threshold;
+        int i_best_cost = -1;   /* -1: no candidate evaluated yet */
+        int k, n;
+
+        /* threshold = half of the strongest edge bin, bin 2 left out */
+        i_threshold = res->i_edge_4x4[by][bx][0];
+        for( k = 1; k < 9; k++ )
+        {
+            if( k == 2 )
+            {
+                continue;
+            }
+            if( i_threshold < res->i_edge_4x4[by][bx][k] )
+            {
+                i_threshold = res->i_edge_4x4[by][bx][k];
+            }
+        }
+        i_threshold /= 2;
+
+        /* bin 2 is pinned to the threshold so that it is never pruned */
+        res->i_edge_4x4[by][bx][2] = i_threshold;
+
+        predict_4x4_mode_available( mb, i_blk, candidates, &i_candidates );
+        for( n = 0; n < i_candidates; n++ )
+        {
+            const int i_mode  = candidates[n];
+            const int i_fixed = x264_mb_pred_mode4x4_fix( i_mode );
+            int i_cost;
+
+            /* only directions with enough edge energy are evaluated */
+            if( res->i_edge_4x4[by][bx][i_fixed] >= i_threshold )
+            {
+                /* render the candidate prediction */
+                h->predict_4x4[i_mode]( p_rec_blk, i_rec );
+
+                /* distortion plus mode signalling cost */
+                i_cost = h->pixf.satd[PIXEL_4x4]( p_rec_blk, i_rec, p_org_blk, i_org );
+                if( i_expected == i_fixed )
+                {
+                    i_cost += res->i_lambda;
+                }
+                else
+                {
+                    i_cost += res->i_lambda * 4;
+                }
+
+                /* strictly cheaper candidates replace the current best */
+                if( i_best_cost == -1 || i_cost < i_best_cost )
+                {
+                    res->i_predict4x4[bx][by] = i_mode;
+                    i_best_cost = i_cost;
+                }
+            }
+        }
+        res->i_sad_i4x4 += i_best_cost;
+
+        /* this block has to be encoded now: the next ones depend on it */
+        mb->block[i_blk].i_intra4x4_pred_mode = res->i_predict4x4[bx][by];
+        h->predict_4x4[res->i_predict4x4[bx][by]]( p_rec_blk, i_rec );
+        x264_mb_encode_4x4( h, mb, i_blk, res->i_qp );
+    }
+
+    /* fixed penalty for the 4x4 mode, from JVT (SATD0) */
+    res->i_sad_i4x4 += res->i_lambda * 24;
+}
+
+/*
+ * Pick the intra chroma (8x8) predictor shared by both chroma planes.
+ *
+ * Every usable mode is rendered into planes 1 and 2 of the reconstruction
+ * (mb->context->p_fdec[1..2]) and scored as the sum of the two 8x8 SATDs
+ * against the source planes plus
+ *     i_lambda * bs_size_ue( fixed mode ).
+ *
+ * Outputs:
+ *   res->i_predict8x8  winning mode (untouched when no candidate is tested)
+ *   res->i_sad_i8x8    cost of the winner, or -1 when no candidate is tested
+ */
+static void x264_macroblock_analyse_intra_chroma( x264_t *h, x264_macroblock_t *mb, x264_mb_analysis_t *res )
+{
+    uint8_t *p_rec[2], *p_org[2];
+    int      i_rec[2], i_org[2];
+
+    int candidates[9];
+    int i_candidates;
+    int n, c;
+
+    /* gather the two chroma planes (plane index 1 and 2 of the context) */
+    for( c = 0; c < 2; c++ )
+    {
+        p_rec[c] = mb->context->p_fdec[1 + c];
+        i_rec[c] = mb->context->i_fdec[1 + c];
+        p_org[c] = mb->context->p_img[1 + c];
+        i_org[c] = mb->context->i_img[1 + c];
+    }
+
+    predict_8x8_mode_available( mb, candidates, &i_candidates );
+
+    /* -1 is the "nothing evaluated yet" marker */
+    res->i_sad_i8x8 = -1;
+
+    for( n = 0; n < i_candidates; n++ )
+    {
+        const int i_mode = candidates[n];
+        int i_cost;
+
+        /* render the candidate on both planes */
+        for( c = 0; c < 2; c++ )
+        {
+            h->predict_8x8[i_mode]( p_rec[c], i_rec[c] );
+        }
+
+        /* distortion of both planes plus mode signalling cost */
+        i_cost = 0;
+        for( c = 0; c < 2; c++ )
+        {
+            i_cost += h->pixf.satd[PIXEL_8x8]( p_rec[c], i_rec[c], p_org[c], i_org[c] );
+        }
+        i_cost += res->i_lambda * bs_size_ue( x264_mb_pred_mode8x8_fix( i_mode ) );
+
+        /* keep the current best unless this candidate is strictly cheaper */
+        if( res->i_sad_i8x8 != -1 && res->i_sad_i8x8 <= i_cost )
+        {
+            continue;
+        }
+        res->i_predict8x8 = i_mode;
+        res->i_sad_i8x8   = i_cost;
+    }
+}
+
+/*
+ * Motion search for the 8x8 partitioning of a P macroblock.
+ *
+ * The reference picture is the one already selected by the 16x16 search
+ * (res->i_ref_p16x16).  For each of the four 8x8 quadrants the function
+ * predicts a motion vector, refines it with x264_me_p_umhexagons() and adds
+ * the estimated cost of the motion vector difference and of the
+ * sub-partition type.  The tables below describe four sub-partition shapes
+ * (8x8, 8x4, 4x8, 4x4) but the loop over them is disabled, so only the first
+ * entry (D_L0_8x8) is ever evaluated.
+ *
+ * Outputs:
+ *   res->i_ref_p8x8               reference index used
+ *   res->i_sub_partition_p8x8[i]  chosen sub-partition of quadrant i
+ *   res->i_mv_p8x8[i][sub][0..1]  chosen motion vectors
+ *   res->i_sad_p8x8               total cost, including 4 times the
+ *                                 reference index cost
+ *
+ * Side effects: mb->i_partition is set to D_8x8, and mb->i_sub_partition[]
+ * plus the partition data of mb are updated as each quadrant is decided,
+ * because the prediction for the following quadrants reads them.
+ *
+ * NOTE(review): the only call to this function visible in this part of the
+ * file is commented out.
+ */
+static void x264_macroblock_analyse_inter_p8x8( x264_t *h, x264_macroblock_t *mb, x264_mb_analysis_t *res )
+{
+    x264_mb_context_t *ctx = mb->context;
+    int i_ref = res->i_ref_p16x16;
+
+    /* luma plane of the reference picture and of the source picture */
+    uint8_t *p_fref = ctx->p_fref0[i_ref][0];
+    int      i_fref = ctx->i_fref0[i_ref][0];
+    uint8_t *p_img  = ctx->p_img[0];
+    int      i_img  = ctx->i_img[0];
+
+    int i;
+
+    res->i_ref_p8x8 = i_ref;
+    res->i_sad_p8x8 = 0;
+    mb->i_partition = D_8x8;
+
+    /* one iteration per 8x8 quadrant */
+    for( i = 0; i < 4; i++ )
+    {
+        /* per shape: sub-partition type, pixel-function index and the pixel
+         * offset of each sub-block inside the quadrant */
+        static const int test8x8_mode[4] = { D_L0_8x8, D_L0_8x4, D_L0_4x8, D_L0_4x4 };
+        static const int test8x8_pix[4]  = { PIXEL_8x8, PIXEL_8x4, PIXEL_4x8, PIXEL_4x4 };
+        static const int test8x8_pos_x[4][4] = { { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 4, 0, 0 }, { 0, 4, 0, 4 } };
+        static const int test8x8_pos_y[4][4] = { { 0, 0, 0, 0 }, { 0, 4, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 4, 4 } };
+        int i_test;
+        int mvp[4][2];  /* predicted motion vectors, one per sub-block */
+        int mv[4][2];   /* refined motion vectors, one per sub-block */
+
+        int x, y;
+        int i_sub;
+        int i_b_satd;   /* best cost so far for this quadrant, -1 = none */
+
+        /* pixel position of the quadrant inside the macroblock */
+        y = 8 * (i / 2);
+        x = 8 * (i % 2);
+        i_b_satd = -1;
+
+        i_test = 0;
+        /* FIXME: testing all four shapes is too slow, so the loop is disabled
+         * and only shape 0 (8x8) is evaluated */
+        //for( i_test = 0; i_test < 4; i_test++ )
+        {
+            int i_satd;
+
+            i_satd = 0;
+
+            /* must be set before predicting: the mv predictor receives mb */
+            mb->i_sub_partition[i] = test8x8_mode[i_test];
+
+            for( i_sub = 0; i_sub < mb_sub_partition_count( test8x8_mode[i_test] ); i_sub++ )
+            {
+                /* start the search from the predicted motion vector */
+                x264_macroblock_predict_mv( mb, 0, i, i_sub, &mvp[i_sub][0], &mvp[i_sub][1] );
+                mv[i_sub][0] = mvp[i_sub][0];
+                mv[i_sub][1] = mvp[i_sub][1];
+
+                i_satd += x264_me_p_umhexagons( h,
+                                                &p_fref[(y+test8x8_pos_y[i_test][i_sub])*i_fref +x+test8x8_pos_x[i_test][i_sub]], i_fref,
+                                                &p_img[(y+test8x8_pos_y[i_test][i_sub])*i_img +x+test8x8_pos_x[i_test][i_sub]], i_img,
+                                                test8x8_pix[i_test],
+                                                res->i_lambda,
+                                                &mv[i_sub][0], &mv[i_sub][1] );
+                /* cost of the motion vector difference */
+                i_satd += res->i_lambda * ( bs_size_se( mv[i_sub][0] - mvp[i_sub][0] ) +
+                                            bs_size_se( mv[i_sub][1] - mvp[i_sub][1] ) );
+            }
+
+            /* cost of signalling the sub-partition type */
+            switch( test8x8_mode[i_test] )
+            {
+                case D_L0_8x8:
+                    i_satd += res->i_lambda * bs_size_ue( 0 );
+                    break;
+                case D_L0_8x4:
+                    i_satd += res->i_lambda * bs_size_ue( 1 );
+                    break;
+                case D_L0_4x8:
+                    i_satd += res->i_lambda * bs_size_ue( 2 );
+                    break;
+                case D_L0_4x4:
+                    i_satd += res->i_lambda * bs_size_ue( 3 );
+                    break;
+                default:
+                    fprintf( stderr, "internal error (invalid sub type)\n" );
+                    break;
+            }
+
+            /* remember this shape when it is the first or strictly cheaper */
+            if( i_b_satd == -1 || i_b_satd > i_satd )
+            {
+                i_b_satd = i_satd;
+                res->i_sub_partition_p8x8[i] = test8x8_mode[i_test];;
+                for( i_sub = 0; i_sub < mb_sub_partition_count( test8x8_mode[i_test] ); i_sub++ )
+                {
+                    res->i_mv_p8x8[i][i_sub][0] = mv[i_sub][0];
+                    res->i_mv_p8x8[i][i_sub][1] = mv[i_sub][1];
+                }
+            }
+        }
+
+        res->i_sad_p8x8 += i_b_satd;
+        /* needed for the next block */
+        mb->i_sub_partition[i] = res->i_sub_partition_p8x8[i];
+        for( i_sub = 0; i_sub < mb_sub_partition_count( res->i_sub_partition_p8x8[i] ); i_sub++ )
+        {
+            x264_macroblock_partition_set( mb, 0, i, i_sub,
+                                           res->i_ref_p8x8,
+                                           res->i_mv_p8x8[i][i_sub][0],
+                                           res->i_mv_p8x8[i][i_sub][1] );
+        }
+    }
+
+    /* the reference index cost is counted once per quadrant */
+    res->i_sad_p8x8 += 4*res->i_lambda * bs_size_te( h->sh.i_num_ref_idx_l0_active - 1, i_ref );
+}
+
+/*
+ * Inter (P) analysis of one macroblock.
+ *
+ * 1. 16x16: every list-0 reference (0 .. h->i_ref0 - 1) is searched with
+ *    x264_me_p_umhexagons() starting from the predicted motion vector.  A
+ *    result equal to the prediction gets a bonus of 16 * i_lambda, otherwise
+ *    the motion vector difference is charged; the reference index cost is
+ *    always added.  The cheapest reference is kept in res->i_ref_p16x16,
+ *    res->i_mv_p16x16[] and res->i_sad_p16x16.
+ * 2. 16x8 and 8x16: refined on the reference chosen in step 1.  16x8 is
+ *    skipped when res->i_predict16x16 is I_PRED_16x16_V and 8x16 when it is
+ *    I_PRED_16x16_H, so the intra 16x16 analysis must have run before.
+ *
+ * Costs that are not computed keep the value -1.  The 8x8 analysis
+ * (x264_macroblock_analyse_inter_p8x8) is not run from here, so
+ * res->i_sad_p8x8 always stays -1.
+ *
+ * Side effects: mb->i_type, mb->i_partition and the partition data of mb are
+ * overwritten while searching.
+ */
+static void x264_macroblock_analyse_inter( x264_t *h, x264_macroblock_t *mb, x264_mb_analysis_t *res )
+{
+    x264_mb_context_t *ctx = mb->context;
+
+    /* source luma plane */
+    uint8_t *p_org = ctx->p_img[0];
+    int      i_org = ctx->i_img[0];
+
+    /* luma plane of the reference retained after the 16x16 search */
+    uint8_t *p_ref;
+    int      i_ref_stride;
+
+    int i_ref;
+
+    /* -1 marks a partitioning that has not been evaluated */
+    res->i_sad_p16x16 = -1;
+    res->i_sad_p16x8  = -1;
+    res->i_sad_p8x16  = -1;
+    res->i_sad_p8x8   = -1;
+
+    /* ---- 16x16: search every reference frame ---- */
+    mb->i_type = P_L0;  /* beurk fix that */
+    mb->i_partition = D_16x16;
+    for( i_ref = 0; i_ref < h->i_ref0; i_ref++ )
+    {
+        int pred_x, pred_y;
+        int mv_x, mv_y;
+        int i_cost;
+
+        /* the predictor depends on the reference stored in the partition */
+        x264_macroblock_partition_set( mb, 0, 0, 0, i_ref, 0, 0 );
+        x264_macroblock_predict_mv( mb, 0, 0, 0, &pred_x, &pred_y );
+
+        mv_x = pred_x;
+        mv_y = pred_y;
+        i_cost = x264_me_p_umhexagons( h, ctx->p_fref0[i_ref][0], ctx->i_fref0[i_ref][0],
+                                          p_org,                  i_org,
+                                          PIXEL_16x16, res->i_lambda, &mv_x, &mv_y );
+
+        if( mv_x != pred_x || mv_y != pred_y )
+        {
+            /* charge the motion vector difference */
+            i_cost += res->i_lambda * ( bs_size_se( mv_x - pred_x ) + bs_size_se( mv_y - pred_y ) );
+        }
+        else
+        {
+            /* favour vectors equal to their prediction */
+            i_cost -= 16 * res->i_lambda;
+        }
+        i_cost += res->i_lambda * bs_size_te( h->sh.i_num_ref_idx_l0_active - 1, i_ref );
+
+        /* keep the current best unless this reference is strictly cheaper */
+        if( res->i_sad_p16x16 != -1 && i_cost >= res->i_sad_p16x16 )
+        {
+            continue;
+        }
+        res->i_sad_p16x16   = i_cost;
+        res->i_ref_p16x16   = i_ref;
+        res->i_mv_p16x16[0] = mv_x;
+        res->i_mv_p16x16[1] = mv_y;
+    }
+
+    /* ---- refinement, using the reference found in 16x16 mode ---- */
+    i_ref = res->i_ref_p16x16;
+    x264_macroblock_partition_set( mb, 0, 0, 0, i_ref, 0, 0 );
+
+    p_ref        = ctx->p_fref0[i_ref][0];
+    i_ref_stride = ctx->i_fref0[i_ref][0];
+
+    /* XXX i_predict16x16 is tested so that only shapes following the edge
+     * direction are tried.  A better algorithm should replace this (the one
+     * with edge detection, to be used for intra modes too). */
+
+    /* ---- 16x8: two halves stacked vertically ---- */
+    if( res->i_predict16x16 != I_PRED_16x16_V )
+    {
+        int pred[2][2];
+        int i_mvd_bits;
+        int i_part;
+
+        mb->i_partition = D_16x8;
+        res->i_ref_p16x8 = i_ref;
+
+        x264_macroblock_predict_mv( mb, 0, 0, 0, &pred[0][0], &pred[0][1] );
+        x264_macroblock_predict_mv( mb, 0, 1, 0, &pred[1][0], &pred[1][1] );
+
+        /* both searches start from their predicted vector */
+        for( i_part = 0; i_part < 2; i_part++ )
+        {
+            res->i_mv_p16x8[i_part][0] = pred[i_part][0];
+            res->i_mv_p16x8[i_part][1] = pred[i_part][1];
+        }
+
+        /* upper half, then lower half (8 rows further down) */
+        res->i_sad_p16x8  = x264_me_p_umhexagons( h,
+                                                  p_ref, i_ref_stride,
+                                                  p_org, i_org,
+                                                  PIXEL_16x8,
+                                                  res->i_lambda,
+                                                  &res->i_mv_p16x8[0][0], &res->i_mv_p16x8[0][1] );
+        res->i_sad_p16x8 += x264_me_p_umhexagons( h,
+                                                  &p_ref[8*i_ref_stride], i_ref_stride,
+                                                  &p_org[8*i_org],        i_org,
+                                                  PIXEL_16x8,
+                                                  res->i_lambda,
+                                                  &res->i_mv_p16x8[1][0], &res->i_mv_p16x8[1][1] );
+
+        /* motion vector differences of both halves */
+        i_mvd_bits = 0;
+        for( i_part = 0; i_part < 2; i_part++ )
+        {
+            i_mvd_bits += bs_size_se( res->i_mv_p16x8[i_part][0] - pred[i_part][0] );
+            i_mvd_bits += bs_size_se( res->i_mv_p16x8[i_part][1] - pred[i_part][1] );
+        }
+        res->i_sad_p16x8 += res->i_lambda * i_mvd_bits;
+
+        /* the reference index is charged once per half */
+        res->i_sad_p16x8 += 2*res->i_lambda * bs_size_te( h->sh.i_num_ref_idx_l0_active - 1, i_ref );
+    }
+
+    /* ---- 8x16: two halves side by side ---- */
+    if( res->i_predict16x16 != I_PRED_16x16_H )
+    {
+        int pred[2][2];
+        int i_mvd_bits;
+        int i_part;
+
+        mb->i_partition = D_8x16;
+        res->i_ref_p8x16 = i_ref;
+
+        x264_macroblock_predict_mv( mb, 0, 0, 0, &pred[0][0], &pred[0][1] );
+        x264_macroblock_predict_mv( mb, 0, 1, 0, &pred[1][0], &pred[1][1] );
+
+        /* both searches start from their predicted vector */
+        for( i_part = 0; i_part < 2; i_part++ )
+        {
+            res->i_mv_p8x16[i_part][0] = pred[i_part][0];
+            res->i_mv_p8x16[i_part][1] = pred[i_part][1];
+        }
+
+        /* left half, then right half (8 columns further right) */
+        res->i_sad_p8x16  = x264_me_p_umhexagons( h,
+                                                  p_ref, i_ref_stride,
+                                                  p_org, i_org,
+                                                  PIXEL_8x16,
+                                                  res->i_lambda,
+                                                  &res->i_mv_p8x16[0][0], &res->i_mv_p8x16[0][1] );
+        res->i_sad_p8x16 += x264_me_p_umhexagons( h,
+                                                  &p_ref[8], i_ref_stride,
+                                                  &p_org[8], i_org,
+                                                  PIXEL_8x16,
+                                                  res->i_lambda,
+                                                  &res->i_mv_p8x16[1][0], &res->i_mv_p8x16[1][1] );
+
+        /* motion vector differences of both halves */
+        i_mvd_bits = 0;
+        for( i_part = 0; i_part < 2; i_part++ )
+        {
+            i_mvd_bits += bs_size_se( res->i_mv_p8x16[i_part][0] - pred[i_part][0] );
+            i_mvd_bits += bs_size_se( res->i_mv_p8x16[i_part][1] - pred[i_part][1] );
+        }
+        res->i_sad_p8x16 += res->i_lambda * i_mvd_bits;
+
+        /* the reference index is charged once per half */
+        res->i_sad_p8x16 += 2*res->i_lambda * bs_size_te( h->sh.i_num_ref_idx_l0_active - 1, i_ref );
+    }
+}
+
+/*****************************************************************************
+ * x264_macroblock_analyse:
+ *  Decide how the macroblock will be coded and store the decision in mb.
+ *
+ *  Steps:
+ *   - derive the quantiser (clipped to 0..51) and the matching lambda,
+ *   - run the edge analysis, then the intra 16x16 and intra 4x4 analyses,
+ *   - for P slices also run the inter analysis,
+ *   - select the macroblock type: for I slices the cheaper of I_4x4 and
+ *     I_16x16, for every other slice type the cheapest of all evaluated
+ *     candidates (a cost of -1 means "not evaluated" and is ignored),
+ *   - for intra types, choose the chroma prediction mode,
+ *   - copy the winning prediction modes or motion data into mb.
+ *
+ *  h            encoder handle
+ *  mb           macroblock to analyse; i_qp_delta, i_type, i_partition and
+ *               the prediction/motion fields are written
+ *  i_slice_type slice type; SLICE_TYPE_I and SLICE_TYPE_P are tested
+ *               explicitly
+ *
+ *  Note: x264_macroblock_analyse_i4x4() encodes the 4x4 blocks as a side
+ *  effect, whatever type is finally selected.
+ *****************************************************************************/
+void x264_macroblock_analyse( x264_t *h, x264_macroblock_t *mb, int i_slice_type )
+{
+    x264_mb_analysis_t analysis;
+    int i;
+
+    /* qp TODO */
+    mb->i_qp_delta = 0;
+
+    /* init analysis */
+    analysis.i_qp = x264_clip3( h->pps.i_pic_init_qp + h->sh.i_qp_delta + mb->i_qp_delta, 0, 51 );
+    analysis.i_lambda = i_qp0_cost_table[analysis.i_qp];
+
+    x264_macroblock_analyse_edge( h, mb, &analysis );
+
+    /*--------------------------- Do the analysis ---------------------------*/
+    x264_macroblock_analyse_i16x16( h, mb, &analysis );
+    x264_macroblock_analyse_i4x4  ( h, mb, &analysis );
+    if( i_slice_type == SLICE_TYPE_P )
+    {
+        /* needs analysis.i_predict16x16, so it must follow the i16x16 analysis */
+        x264_macroblock_analyse_inter( h, mb, &analysis );
+    }
+
+    /*-------------------- Choose the macroblock mode -----------------------*/
+    /* Do the MB decision */
+    if( i_slice_type == SLICE_TYPE_I )
+    {
+        mb->i_type = analysis.i_sad_i4x4 < analysis.i_sad_i16x16 ? I_4x4 : I_16x16;
+    }
+    else
+    {
+        int i_satd;
+/* Switch to (type, partition) when its cost was evaluated (not -1) and is
+ * strictly lower than the best cost found so far. */
+#define BEST_TYPE( type, partition, satd ) \
+        if( satd != -1 && satd < i_satd ) \
+        {   \
+            i_satd = satd;  \
+            mb->i_type = type; \
+            mb->i_partition = partition; \
+        }
+
+        /* start from I_4x4, then let each other candidate try to beat it */
+        i_satd = analysis.i_sad_i4x4;
+        mb->i_type = I_4x4;
+
+        BEST_TYPE( I_16x16, -1,    analysis.i_sad_i16x16 );
+        BEST_TYPE( P_L0,  D_16x16, analysis.i_sad_p16x16 );
+        BEST_TYPE( P_L0,  D_16x8 , analysis.i_sad_p16x8  );
+        BEST_TYPE( P_L0,  D_8x16 , analysis.i_sad_p8x16  );
+        BEST_TYPE( P_8x8, D_8x8  , analysis.i_sad_p8x8   );
+
+#undef BEST_TYPE
+    }
+
+    /* the chroma mode is only searched once an intra type has been chosen */
+    if( IS_INTRA( mb->i_type ) )
+    {
+        x264_macroblock_analyse_intra_chroma( h, mb, &analysis );
+    }
+
+    /*-------------------- Update MB from the analysis ----------------------*/
+    switch( mb->i_type )
+    {
+        case I_4x4:
+            /* i_predict4x4 is indexed [x][y] */
+            for( i = 0; i < 16; i++ )
+            {
+                mb->block[i].i_intra4x4_pred_mode = analysis.i_predict4x4[block_idx_x[i]][block_idx_y[i]];
+            }
+            mb->i_chroma_pred_mode = analysis.i_predict8x8;
+            break;
+        case I_16x16:
+            mb->i_intra16x16_pred_mode = analysis.i_predict16x16;
+            mb->i_chroma_pred_mode = analysis.i_predict8x8;
+            break;
+        case P_L0:
+            /* restore the reference and motion vectors of the chosen shape */
+            switch( mb->i_partition )
+            {
+                case D_16x16:
+                    x264_macroblock_partition_set( mb, 0, 0, 0,
+                                                   analysis.i_ref_p16x16, analysis.i_mv_p16x16[0], analysis.i_mv_p16x16[1] );
+                    break;
+                case D_16x8:
+                    x264_macroblock_partition_set( mb, 0, 0, 0,
+                                                   analysis.i_ref_p16x8, analysis.i_mv_p16x8[0][0], analysis.i_mv_p16x8[0][1] );
+                    x264_macroblock_partition_set( mb, 0, 1, 0,
+                                                   analysis.i_ref_p16x8, analysis.i_mv_p16x8[1][0], analysis.i_mv_p16x8[1][1] );
+                    break;
+                case D_8x16:
+                    x264_macroblock_partition_set( mb, 0, 0, 0,
+                                                   analysis.i_ref_p8x16, analysis.i_mv_p8x16[0][0], analysis.i_mv_p8x16[0][1] );
+                    x264_macroblock_partition_set( mb, 0, 1, 0,
+                                                   analysis.i_ref_p8x16, analysis.i_mv_p8x16[1][0], analysis.i_mv_p8x16[1][1] );
+                    break;
+                default:
+                    fprintf( stderr, "internal error\n" );
+                    break;
+            }
+            break;
+
+        case P_8x8:
+            /* one sub-partition type per 8x8 quadrant, each with its own vectors */
+            for( i = 0; i < 4; i++ )
+            {
+                int i_sub;
+
+                mb->i_sub_partition[i] = analysis.i_sub_partition_p8x8[i];
+                for( i_sub = 0; i_sub < mb_sub_partition_count( mb->i_sub_partition[i] ); i_sub++ )
+                {
+                    x264_macroblock_partition_set( mb, 0, i, i_sub,
+                                                   analysis.i_ref_p8x8,
+                                                   analysis.i_mv_p8x8[i][i_sub][0],
+                                                   analysis.i_mv_p8x8[i][i_sub][1] );
+                }
+            }
+            break;
+
+        default:
+            fprintf( stderr, "internal error\n" );
+            break;
+    }
+}
+
+
+
+/*****************************************************************************
+ * x264_macroblock_encode:
+ *  Encode one macroblock whose type and prediction data were already chosen.
+ *
+ *  Luma:
+ *   - I_16x16: predict, then x264_mb_encode_i16x16().
+ *   - I_4x4:   predict and encode the 16 blocks one after the other.
+ *   - inter:   motion compensate, transform and quantise the residual of
+ *              every 4x4 block, then decimate: an 8x8 block scoring below 4
+ *              is cleared, and a macroblock scoring below 6 is cleared
+ *              entirely; otherwise the residual is reconstructed into the
+ *              decoded picture.
+ *  Chroma: predict (intra) or motion compensate (inter), then
+ *  x264_mb_encode_8x8() with the chroma quantiser.
+ *
+ *  Afterwards the intra prediction modes are converted with the *_fix()
+ *  helpers, the per-block non-zero counts and the luma/chroma coded block
+ *  patterns are computed, and a P_L0 16x16 macroblock without residual
+ *  that uses reference 0 and the skip-predicted motion vector is turned
+ *  into P_SKIP.
+ *
+ *  h   encoder handle
+ *  mb  macroblock to encode; coefficients, non-zero counts, i_cbp_luma,
+ *      i_cbp_chroma, the prediction mode fields and possibly i_type are
+ *      written
+ *****************************************************************************/
+void x264_macroblock_encode( x264_t *h, x264_macroblock_t *mb )
+{
+    int i;
+
+    int     i_qscale;
+
+    /* quantification scale */
+    i_qscale = x264_clip3( h->pps.i_pic_init_qp + h->sh.i_qp_delta + mb->i_qp_delta, 0, 51 );
+
+    if( mb->i_type == I_16x16 )
+    {
+        /* do the right prediction */
+        h->predict_16x16[mb->i_intra16x16_pred_mode]( mb->context->p_fdec[0], mb->context->i_fdec[0] );
+
+        /* encode the 16x16 macroblock */
+        x264_mb_encode_i16x16( h, mb, i_qscale );
+
+        /* fix the pred mode value */
+        mb->i_intra16x16_pred_mode = x264_mb_pred_mode16x16_fix( mb->i_intra16x16_pred_mode );
+    }
+    else if( mb->i_type == I_4x4 )
+    {
+        for( i = 0; i < 16; i++ )
+        {
+            uint8_t *p_dst_by;
+
+            /* Do the right prediction */
+            p_dst_by = mb->context->p_fdec[0] + 4 * block_idx_x[i] + 4 * block_idx_y[i] * mb->context->i_fdec[0];
+            h->predict_4x4[mb->block[i].i_intra4x4_pred_mode]( p_dst_by, mb->context->i_fdec[0] );
+
+            /* encode one 4x4 block */
+            x264_mb_encode_4x4( h, mb, i, i_qscale );
+
+            /* fix the pred mode value */
+            mb->block[i].i_intra4x4_pred_mode = x264_mb_pred_mode4x4_fix( mb->block[i].i_intra4x4_pred_mode );
+        }
+    }
+    else    /* Inter MB */
+    {
+        x264_mb_context_t *ctx = mb->context;
+        int16_t dct4x4[16][4][4];
+
+        int i8x8, i4x4, idx;
+        int i_decimate_mb = 0;
+
+        /* Motion compensation */
+        x264_macroblock_mc( h, mb, 1 );
+
+        for( i8x8 = 0; i8x8 < 4; i8x8++ )
+        {
+            int16_t luma[4][4];
+            int i_decimate_8x8;
+
+            /* encode one 4x4 block */
+            i_decimate_8x8 = 0;
+            for( i4x4 = 0; i4x4 < 4; i4x4++ )
+            {
+                uint8_t *p_src, *p_dst;
+
+                idx = i8x8 * 4 + i4x4;
+
+                p_src = ctx->p_img[0] + 4 * block_idx_x[idx] + 4 * block_idx_y[idx] * ctx->i_img[0];
+                p_dst = ctx->p_fdec[0] + 4 * block_idx_x[idx] + 4 * block_idx_y[idx] * ctx->i_fdec[0];
+
+                /* we calculate diff */
+                h->pixf.sub4x4( luma, p_src, ctx->i_img[0],p_dst, ctx->i_fdec[0] );
+
+                /* calculate dct coeffs */
+                h->dctf.dct4x4( dct4x4[idx], luma );
+                quant_4x4( dct4x4[idx], i_qscale, 1 );
+
+                scan_zigzag_4x4full( mb->block[idx].luma4x4, dct4x4[idx] );
+                i_decimate_8x8 += x264_mb_decimate_score( mb->block[idx].luma4x4, 16 );
+            }
+
+            /* decimate this 8x8 block: a score below 4 is not worth coding,
+             * so both the scanned and the raw coefficients are cleared */
+            i_decimate_mb += i_decimate_8x8;
+            if( i_decimate_8x8 < 4 )
+            {
+                for( i4x4 = 0; i4x4 < 4; i4x4++ )
+                {
+                    int x, y;
+                    idx = i8x8 * 4 + i4x4;
+                    for( i = 0; i < 16; i++ )
+                    {
+                        mb->block[idx].luma4x4[i] = 0;
+                    }
+                    for( x = 0; x < 4; x++ )
+                    {
+                        for( y = 0; y < 4; y++ )
+                        {
+                            dct4x4[idx][x][y] = 0;
+                        }
+                    }
+                }
+            }
+        }
+
+        if( i_decimate_mb < 6 )
+        {
+            /* whole macroblock decimated: drop every luma coefficient and
+             * keep the motion compensated prediction as reconstruction */
+            for( i8x8 = 0; i8x8 < 4; i8x8++ )
+            {
+                for( i4x4 = 0; i4x4 < 4; i4x4++ )
+                {
+                    for( i = 0; i < 16; i++ )
+                    {
+                        mb->block[i8x8 * 4 + i4x4].luma4x4[i] = 0;
+                    }
+                }
+            }
+        }
+        else
+        {
+            for( i8x8 = 0; i8x8 < 4; i8x8++ )
+            {
+                int16_t luma[4][4];
+                /* TODO we could avoid it if we had decimate this 8x8 block */
+                /* output samples to fdec */
+                for( i4x4 = 0; i4x4 < 4; i4x4++ )
+                {
+                    uint8_t *p_dst;
+
+                    idx = i8x8 * 4 + i4x4;
+
+                    dequant_4x4( dct4x4[idx], i_qscale );
+                    h->dctf.idct4x4( luma, dct4x4[idx] );
+
+                    /* put pixel to fdec */
+                    p_dst = ctx->p_fdec[0] + 4 * block_idx_x[idx] + 4 * block_idx_y[idx] * ctx->i_fdec[0];
+                    h->pixf.add4x4( p_dst, ctx->i_fdec[0], luma );
+                }
+            }
+        }
+    }
+
+    /* encode chroma: from here on i_qscale holds the chroma quantiser */
+    i_qscale = i_chroma_qp_table[x264_clip3( i_qscale + h->pps.i_chroma_qp_index_offset, 0, 51 )];
+    if( IS_INTRA( mb->i_type ) )
+    {
+        /* do the right prediction */
+        h->predict_8x8[mb->i_chroma_pred_mode]( mb->context->p_fdec[1], mb->context->i_fdec[1] );
+        h->predict_8x8[mb->i_chroma_pred_mode]( mb->context->p_fdec[2], mb->context->i_fdec[2] );
+    }
+    else
+    {
+        /* Motion compensation */
+        x264_macroblock_mc( h, mb, 0 );
+    }
+    /* encode the 8x8 blocks */
+    x264_mb_encode_8x8( h, mb, !IS_INTRA( mb->i_type ), i_qscale );
+
+    /* fix the pred mode value */
+    if( IS_INTRA( mb->i_type ) )
+    {
+        mb->i_chroma_pred_mode = x264_mb_pred_mode8x8_fix( mb->i_chroma_pred_mode );
+    }
+
+    /* Calculate the Luma/Chroma pattern and non_zero_count */
+    if( mb->i_type == I_16x16 )
+    {
+        /* I_16x16 codes the luma AC of all blocks or of none */
+        mb->i_cbp_luma = 0x00;
+        for( i = 0; i < 16; i++ )
+        {
+            mb->block[i].i_non_zero_count = array_non_zero_count( mb->block[i].residual_ac, 15 );
+            if( mb->block[i].i_non_zero_count > 0 )
+            {
+                mb->i_cbp_luma = 0x0f;
+            }
+        }
+    }
+    else
+    {
+        /* one bit per 8x8 block, i.e. per group of four 4x4 blocks */
+        mb->i_cbp_luma = 0x00;
+        for( i = 0; i < 16; i++ )
+        {
+            mb->block[i].i_non_zero_count = array_non_zero_count( mb->block[i].luma4x4, 16 );
+            if( mb->block[i].i_non_zero_count > 0 )
+            {
+                mb->i_cbp_luma |= 1 << (i/4);
+            }
+        }
+    }
+
+    /* Calculate the chroma pattern */
+    mb->i_cbp_chroma = 0x00;
+    for( i = 0; i < 8; i++ )
+    {
+        mb->block[16+i].i_non_zero_count = array_non_zero_count( mb->block[16+i].residual_ac, 15 );
+        if( mb->block[16+i].i_non_zero_count > 0 )
+        {
+            mb->i_cbp_chroma = 0x02;    /* dc+ac (we can't do only ac) */
+        }
+    }
+    /* no AC at all: check whether either plane still has DC coefficients.
+     * (The closing parenthesis used to sit before the last "> 0", which
+     * compared the result of the || instead of the second count.) */
+    if( mb->i_cbp_chroma == 0x00 &&
+        ( array_non_zero_count( mb->chroma_dc[0], 4 ) > 0 || array_non_zero_count( mb->chroma_dc[1], 4 ) > 0 ) )
+    {
+        mb->i_cbp_chroma = 0x01;    /* dc only */
+    }
+
+    /* Check for P_SKIP
+     * XXX: in the me perhaps we should take x264_macroblock_predict_mv_pskip into account
+     *      (if multiple mv give same result)*/
+    if( mb->i_type == P_L0 && mb->i_partition == D_16x16 &&
+        mb->i_cbp_luma == 0x00 && mb->i_cbp_chroma == 0x00 )
+    {
+
+        int i_ref;
+        int mvx, mvy;
+        x264_macroblock_partition_get( mb, 0, 0, 0, &i_ref, &mvx, &mvy );
+
+        /* a skip is only possible on reference 0 with the skip-predicted vector */
+        if( i_ref == 0 )
+        {
+            int mvxp, mvyp;
+
+            x264_macroblock_predict_mv_pskip( mb, &mvxp, &mvyp );
+            if( mvxp == mvx && mvyp == mvy )
+            {
+                mb->i_type = P_SKIP;
+            }
+        }
+    }
+}
+
+
+/* Pseudo block indexes accepted by block_residual_write() in place of a
+ * real block index, to select the chroma DC and luma DC handling. */
+#define BLOCK_INDEX_CHROMA_DC   (-1)
+#define BLOCK_INDEX_LUMA_DC     (-2)
+
+/****************************************************************************
+ * block_residual_write:
+ *  Write one block of quantised coefficients to the bitstream using the
+ *  variable length code tables x264_coeff_token, x264_level_prefix,
+ *  x264_total_zeros / x264_total_zeros_dc and x264_run_before.
+ *
+ *  h        encoder handle (passed to predict_non_zero_code)
+ *  s        bitstream writer
+ *  mb       macroblock owning the block (passed to predict_non_zero_code)
+ *  i_idx    block index, or BLOCK_INDEX_CHROMA_DC / BLOCK_INDEX_LUMA_DC
+ *  l        coefficients in scan order, i_count entries
+ *  i_count  number of coefficients in l; the local arrays hold at most 16
+ *
+ *  Written in order: the coefficient token (total non-zero coefficients and
+ *  trailing +/-1 count), the signs of the trailing ones, the remaining
+ *  levels, the total number of zeros, then the zero run before each
+ *  coefficient.
+ ****************************************************************************/
+static void block_residual_write( x264_t *h, bs_t *s, x264_macroblock_t *mb, int i_idx, int *l, int i_count )
+{
+    /* non-zero coefficients and the zero run preceding each of them, both
+     * stored from the last coefficient in scan order towards the first */
+    int level[16], run[16];
+    int i_total, i_trailing;    /* non-zero count, trailing +/-1 count (max 3) */
+    int i_total_zero;           /* zeros located before the last non-zero coeff */
+    int i_last;
+    unsigned int i_sign;        /* sign bits of the trailing ones, 1 = negative */
+
+    int i;
+    int i_zero_left;
+    int i_suffix_length;
+
+    /* first find i_last */
+    i_last = i_count - 1;
+    while( i_last >= 0 && l[i_last] == 0 )
+    {
+        i_last--;
+    }
+
+    i_sign = 0;
+    i_total = 0;
+    i_trailing = 0;
+    i_total_zero = 0;
+
+    if( i_last >= 0 )
+    {
+        int b_trailing = 1;     /* still inside the leading run of +/-1 levels */
+        int idx = 0;
+
+        /* level and run and total */
+        while( i_last >= 0 )
+        {
+            level[idx] = l[i_last--];
+
+            /* count the zeros between this coefficient and the previous
+             * non-zero one in scan order */
+            run[idx] = 0;
+            while( i_last >= 0 && l[i_last] == 0 )
+            {
+                run[idx]++;
+                i_last--;
+            }
+
+            i_total++;
+            i_total_zero += run[idx];
+
+            /* at most three consecutive +/-1, counted from the end of the
+             * scan, are handled as trailing ones; the first one met ends up
+             * in the most significant sign bit */
+            if( b_trailing && abs( level[idx] ) == 1 && i_trailing < 3 )
+            {
+                i_sign <<= 1;
+                if( level[idx] < 0 )
+                {
+                    i_sign |= 0x01;
+                }
+
+                i_trailing++;
+            }
+            else
+            {
+                b_trailing = 0;
+            }
+
+            idx++;
+        }
+    }
+
+    /* total/trailing */
+    if( i_idx == BLOCK_INDEX_CHROMA_DC )
+    {
+        /* chroma DC always uses table 4 */
+        bs_write_vlc( s, x264_coeff_token[4][i_total][i_trailing] );
+    }
+    else
+    {
+        /* maps the predicted non-zero count (0..16) to a token table (0..3);
+         * predict_non_zero_code returns values in 0 <-> (16+16+1)>>1 = 16 */
+        static const int ct_index[17] = {0,0,1,1,2,2,2,2,3,3,3,3,3,3,3,3,3 };
+        int nC;
+
+        if( i_idx == BLOCK_INDEX_LUMA_DC )
+        {
+            /* luma DC is predicted as block 0 */
+            nC = predict_non_zero_code( h, mb, 0 );
+        }
+        else
+        {
+            nC = predict_non_zero_code( h, mb, i_idx );
+        }
+
+        bs_write_vlc( s, x264_coeff_token[ct_index[nC]][i_total][i_trailing] );
+    }
+
+    /* an empty block is fully described by its token */
+    if( i_total <= 0 )
+    {
+        return;
+    }
+
+    i_suffix_length = i_total > 10 && i_trailing < 3 ? 1 : 0;
+    if( i_trailing > 0 )
+    {
+        /* one sign bit per trailing one */
+        bs_write( s, i_trailing, i_sign );
+    }
+    /* remaining levels; the trailing ones occupy level[0..i_trailing-1] */
+    for( i = i_trailing; i < i_total; i++ )
+    {
+        int i_level_code;
+
+        /* calculate level code: negative levels map to odd codes, positive
+         * levels to even codes */
+        if( level[i] < 0 )
+        {
+            i_level_code = -2*level[i] - 1;
+        }
+        else /* if( level[i] > 0 ) */
+        {
+            i_level_code = 2 * level[i] - 2;
+        }
+        if( i == i_trailing && i_trailing < 3 )
+        {
+            i_level_code -=2; /* as level[i] can't be 1 for the first one if i_trailing < 3 */
+        }
+
+        if( ( i_level_code >> i_suffix_length ) < 14 )
+        {
+            /* regular case: prefix plus i_suffix_length suffix bits */
+            bs_write_vlc( s, x264_level_prefix[i_level_code >> i_suffix_length] );
+            if( i_suffix_length > 0 )
+            {
+                bs_write( s, i_suffix_length, i_level_code );
+            }
+        }
+        else if( i_suffix_length == 0 && i_level_code < 30 )
+        {
+            /* prefix 14 without suffix length: fixed 4 bit suffix */
+            bs_write_vlc( s, x264_level_prefix[14] );
+            bs_write( s, 4, i_level_code - 14 );
+        }
+        else if( i_suffix_length > 0 && ( i_level_code >> i_suffix_length ) == 14 )
+        {
+            bs_write_vlc( s, x264_level_prefix[14] );
+            bs_write( s, i_suffix_length, i_level_code );
+        }
+        else
+        {
+            /* escape: prefix 15 followed by a 12 bit suffix */
+            bs_write_vlc( s, x264_level_prefix[15] );
+            i_level_code -= 15 << i_suffix_length;
+            if( i_suffix_length == 0 )
+            {
+                i_level_code -= 15;
+            }
+
+            /* only reported: the out of range value is still written below */
+            if( i_level_code >= ( 1 << 12 ) || i_level_code < 0 )
+            {
+                fprintf( stderr, "OVERFLOW levelcode=%d\n", i_level_code );
+            }
+
+            bs_write( s, 12, i_level_code );    /* check overflow ?? */
+        }
+
+        /* adapt the suffix length to the magnitude just written (max 6) */
+        if( i_suffix_length == 0 )
+        {
+            i_suffix_length++;
+        }
+        if( abs( level[i] ) > ( 3 << ( i_suffix_length - 1 ) ) && i_suffix_length < 6 )
+        {
+            i_suffix_length++;
+        }
+    }
+
+    /* the zero count is only written when the block is not completely full */
+    if( i_total < i_count )
+    {
+        if( i_idx == BLOCK_INDEX_CHROMA_DC )
+        {
+            bs_write_vlc( s, x264_total_zeros_dc[i_total-1][i_total_zero] );
+        }
+        else
+        {
+            bs_write_vlc( s, x264_total_zeros[i_total-1][i_total_zero] );
+        }
+    }
+
+    /* zero run before each coefficient except the last one written; stops
+     * as soon as no zero is left to distribute */
+    for( i = 0, i_zero_left = i_total_zero; i < i_total - 1; i++ )
+    {
+        int i_zl;
+
+        if( i_zero_left <= 0 )
+        {
+            break;
+        }
+
+        /* the run table is selected by the zeros left, capped at index 6 */
+        i_zl = X264_MIN( i_zero_left - 1, 6 );
+
+        bs_write_vlc( s, x264_run_before[i_zl][run[i]] );
+
+        i_zero_left -= run[i];
+    }
+}
+
+
+
+
+/*****************************************************************************
+ * x264_macroblock_write: write mb's syntax elements (mb_type, prediction or motion data, cbp, qp delta, residual) to s
+ *****************************************************************************/
+void x264_macroblock_write( x264_t *h, bs_t *s, int i_slice_type, x264_macroblock_t *mb )
+{
+    int i;
+    int i_mb_i_offset;      /* added to every intra mb_type code: number of inter mb_types of this slice type */
+    int b_sub_ref0 = 1;     /* P_8x8 only: non-zero when a ref index has to be written per 8x8 partition */
+    /* int b_sub_ref1 = 1; */
+    /* h supplies the slice header ref counts and the source picture; i_slice_type must be SLICE_TYPE_I, _P or _B */
+    switch( i_slice_type )
+    {
+        case SLICE_TYPE_I:
+            i_mb_i_offset = 0;
+            break;
+        case SLICE_TYPE_P:
+            i_mb_i_offset = 5;
+            break;
+        case SLICE_TYPE_B:
+            i_mb_i_offset = 23;     /* intra types follow the 23 B mb_types (0..22) */
+            break;
+        default:
+            fprintf( stderr, "internal error or slice unsupported\n" );
+            return;
+    }
+
+    /* PCM special block type UNTESTED: raw samples are written row by row from the strided source planes */
+    if( mb->i_type == I_PCM )
+    {
+        bs_write_ue( s, i_mb_i_offset + 25 );   /* I_PCM */
+        bs_align_0( s );
+        /* Luma */
+        for( i = 0; i < 16*16; i++ )
+        {
+            bs_write( s, 8, h->picture->plane[0][( mb->i_mb_y * 16 + i / 16 ) * h->picture->i_stride[0] + mb->i_mb_x * 16 + i % 16] );
+        }
+        /* Cb */
+        for( i = 0; i < 8*8; i++ )
+        {
+            bs_write( s, 8, h->picture->plane[1][( mb->i_mb_y * 8 + i / 8 ) * h->picture->i_stride[1] + mb->i_mb_x * 8 + i % 8] );
+        }
+        /* Cr */
+        for( i = 0; i < 8*8; i++ )
+        {
+            bs_write( s, 8, h->picture->plane[2][( mb->i_mb_y * 8 + i / 8 ) * h->picture->i_stride[2] + mb->i_mb_x * 8 + i % 8] );
+        }
+
+        for( i = 0; i < 16 + 8; i++ )
+        {
+            /* special case: every luma and chroma block of a PCM mb reports 16 non-zero coefficients */
+            mb->block[i].i_non_zero_count = 16;
+        }
+        return;
+    }
+
+    if( mb->i_type == I_4x4 )
+    {
+        bs_write_ue( s, i_mb_i_offset + 0 );    /* I_4x4 */
+    }
+    else if( mb->i_type == I_16x16 )
+    {
+        int i_type = 1 + mb->i_intra16x16_pred_mode + mb->i_cbp_chroma * 4 + ( mb->i_cbp_luma == 0 ? 0 : 12 );
+
+        bs_write_ue( s, i_mb_i_offset + i_type );   /* pred mode and cbp are folded into the mb_type code */
+    }
+    else if( mb->i_type == P_L0 )
+    {
+        if( mb->i_partition == D_16x16 )
+        {
+            bs_write_ue( s, 0 );
+        }
+        else if( mb->i_partition == D_16x8 )
+        {
+            bs_write_ue( s, 1 );
+        }
+        else if( mb->i_partition == D_8x16 )
+        {
+            bs_write_ue( s, 2 );
+        }
+    }
+    else if( mb->i_type == P_8x8 )
+    {
+        if( mb->partition[0][0].i_ref[0] == 0 &&
+            mb->partition[0][2].i_ref[0] == 0 &&
+            mb->partition[2][0].i_ref[0] == 0 &&
+            mb->partition[2][2].i_ref[0] == 0 )
+        {
+            b_sub_ref0 = 0;         /* all four refs are 0: no ref index is written below */
+            bs_write_ue( s, 4 );    /* P_8x8ref0 */
+        }
+        else
+        {
+            b_sub_ref0 = 1;
+            bs_write_ue( s, 3 );
+        }
+    }
+    else
+    {
+        /* TODO B type */
+    }
+
+    if( IS_INTRA( mb->i_type ) )
+    {
+        /* Prediction */
+        if( mb->i_type == I_4x4 )
+        {
+            for( i = 0; i < 16; i++ )
+            {
+                int i_predicted_mode = predict_pred_intra4x4_mode( h, mb, i );
+                int i_mode = mb->block[i].i_intra4x4_pred_mode;
+
+                if( i_predicted_mode == i_mode)
+                {
+                    bs_write( s, 1, 1 );  /* b_prev_intra4x4_pred_mode */
+                }
+                else
+                {
+                    bs_write( s, 1, 0 );  /* b_prev_intra4x4_pred_mode */
+                    if( i_mode < i_predicted_mode )
+                    {
+                        bs_write( s, 3, i_mode );
+                    }
+                    else    /* modes above the predicted one are coded minus one: the predicted mode needs no code here */
+                    {
+                        bs_write( s, 3, i_mode - 1 );
+                    }
+                }
+            }
+        }
+        bs_write_ue( s, mb->i_chroma_pred_mode );
+    }
+    else if( mb->i_type == P_8x8 )
+    {
+        /* sub mb type */
+        for( i = 0; i < 4; i++ )
+        {
+            switch( mb->i_sub_partition[i] )
+            {
+                case D_L0_8x8:
+                    bs_write_ue( s, 0 );
+                    break;
+                case D_L0_8x4:
+                    bs_write_ue( s, 1 );
+                    break;
+                case D_L0_4x8:
+                    bs_write_ue( s, 2 );
+                    break;
+                case D_L0_4x4:
+                    bs_write_ue( s, 3 );
+                    break;
+            }
+        }
+        /* ref0 */
+        if( h->sh.i_num_ref_idx_l0_active > 1 && b_sub_ref0 )
+        {
+            for( i = 0; i < 4; i++ )
+            {
+                int i_ref;
+                x264_macroblock_partition_get( mb, 0, i, 0, &i_ref, NULL, NULL );
+
+                bs_write_te( s, h->sh.i_num_ref_idx_l0_active - 1, i_ref );
+            }
+        }
+        for( i = 0; i < 4; i++ )    /* motion vector difference of every sub partition */
+        {
+            int i_part;
+            for( i_part = 0; i_part < mb_sub_partition_count( mb->i_sub_partition[i] ); i_part++ )
+            {
+                int mvx, mvy;
+                int mvxp, mvyp;
+
+                x264_macroblock_partition_get( mb, 0, i, i_part, NULL, &mvx, &mvy );
+                x264_macroblock_predict_mv( mb, 0, i, i_part, &mvxp, &mvyp );
+
+                bs_write_se( s, mvx - mvxp );
+                bs_write_se( s, mvy - mvyp);
+            }
+        }
+    }
+    else if( mb->i_type == B_8x8 )
+    {
+        /* TODO for B-frame (merge it with P_8x8 ?)*/
+    }
+    else if( mb->i_type != B_DIRECT )
+    {
+        /* FIXME -> invalid for B frame */
+
+        /* Motion Vector */
+        int i_part = 1 + ( mb->i_partition != D_16x16 ? 1 : 0 );   /* 1 partition for 16x16, otherwise 2 */
+
+        if( h->sh.i_num_ref_idx_l0_active > 1 )
+        {
+            for( i = 0; i < i_part; i++ )
+            {
+                if( mb->i_type == P_L0 )    /* fixme B-frame */
+                {
+                    int i_ref;
+                    x264_macroblock_partition_get( mb, 0, i, 0, &i_ref, NULL, NULL );
+                    bs_write_te( s, h->sh.i_num_ref_idx_l0_active - 1, i_ref ); /* -1 is correct ? */
+                }
+            }
+        }
+        if( h->sh.i_num_ref_idx_l1_active > 1 )
+        {
+            for( i = 0; i < i_part; i++ )
+            {
+                /* ref idx part L1 TODO when needed */
+            }
+        }
+
+        for( i = 0; i < i_part; i++ )
+        {
+            if( mb->i_type == P_L0 )
+            {
+                int mvx, mvy;
+                int mvxp, mvyp;
+
+                x264_macroblock_partition_get( mb, 0, i, 0, NULL, &mvx, &mvy );
+                x264_macroblock_predict_mv( mb, 0, i, 0, &mvxp, &mvyp );
+
+                bs_write_se( s, mvx - mvxp );
+                bs_write_se( s, mvy - mvyp);
+            }
+        }
+        /* Same for L1 for B frame */
+    }
+
+    if( mb->i_type != I_16x16 )     /* I_16x16 already carries its cbp inside the mb_type code */
+    {
+        if( mb->i_type == I_4x4 )
+        {
+            bs_write_ue( s, intra4x4_cbp_to_golomb[( mb->i_cbp_chroma << 4 )|mb->i_cbp_luma] );
+        }
+        else
+        {
+            bs_write_ue( s, inter_cbp_to_golomb[( mb->i_cbp_chroma << 4 )|mb->i_cbp_luma] );
+        }
+    }
+
+    if( mb->i_cbp_luma > 0 || mb->i_cbp_chroma > 0 || mb->i_type == I_16x16 )
+    {
+        bs_write_se( s, mb->i_qp_delta );
+
+        /* write residual */
+        if( mb->i_type == I_16x16 )
+        {
+            /* DC Luma */
+            block_residual_write( h, s, mb, BLOCK_INDEX_LUMA_DC , mb->luma16x16_dc, 16 );
+
+            if( mb->i_cbp_luma != 0 )
+            {
+                /* AC Luma */
+                for( i = 0; i < 16; i++ )
+                {
+                    block_residual_write( h, s, mb, i, mb->block[i].residual_ac, 15 );
+                }
+            }
+        }
+        else
+        {
+            for( i = 0; i < 16; i++ )
+            {
+                if( mb->i_cbp_luma & ( 1 << ( i / 4 ) ) )   /* bit i/4 flags the group of four 4x4 blocks holding block i */
+                {
+                    block_residual_write( h, s, mb, i, mb->block[i].luma4x4, 16 );
+                }
+            }
+        }
+
+        if( mb->i_cbp_chroma &0x03 )    /* Chroma DC residual present */
+        {
+            block_residual_write( h, s, mb, BLOCK_INDEX_CHROMA_DC, mb->chroma_dc[0], 4 );
+            block_residual_write( h, s, mb, BLOCK_INDEX_CHROMA_DC, mb->chroma_dc[1], 4 );
+        }
+        if( mb->i_cbp_chroma&0x02 ) /* Chroma AC residual present */
+        {
+            for( i = 0; i < 8; i++ )
+            {
+                block_residual_write( h, s, mb, 16 + i, mb->block[16+i].residual_ac, 15 );
+            }
+        }
+    }
+}
+
diff --git a/testing/macroblock-dz.c b/testing/macroblock-dz.c
new file mode 100644
index 00000000..719edcbb
--- /dev/null
+++ b/testing/macroblock-dz.c
@@ -0,0 +1,2266 @@
+/*****************************************************************************
+ * macroblock.c: h264 encoder library
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: macroblock-dz.c,v 1.1 2004/06/03 19:27:08 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdint.h>
+
+#include "../core/common.h"
+#include "../core/vlc.h"
+#include "macroblock.h"
+
+static const uint8_t intra4x4_cbp_to_golomb[48]=    /* presumably maps an intra 4x4 coded block pattern to its code number, indexed by (i_cbp_chroma << 4) | i_cbp_luma -- confirm at the use site */
+{
+  3, 29, 30, 17, 31, 18, 37,  8, 32, 38, 19,  9, 20, 10, 11,  2,
+ 16, 33, 34, 21, 35, 22, 39,  4, 36, 40, 23,  5, 24,  6,  7,  1,
+ 41, 42, 43, 25, 44, 26, 46, 12, 45, 47, 27, 13, 28, 14, 15,  0
+};
+static const uint8_t inter_cbp_to_golomb[48]=       /* presumably the same lookup for inter macroblocks, with the same index layout -- confirm at the use site */
+{
+  0,  2,  3,  7,  4,  8, 17, 13,  5, 18,  9, 14, 10, 15, 16, 11,
+  1, 32, 33, 36, 34, 37, 44, 40, 35, 45, 38, 41, 39, 42, 43, 19,
+  6, 24, 25, 20, 26, 21, 46, 28, 27, 47, 22, 29, 23, 30, 31, 12
+};
+
+static const uint8_t block_idx_x[16] =  /* column, in units of 4 pixels, of 4x4 block idx inside its plane (callers multiply by 4) */
+{
+    0, 1, 0, 1, 2, 3, 2, 3, 0, 1, 0, 1, 2, 3, 2, 3
+};
+static const uint8_t block_idx_y[16] =  /* row, in units of 4 pixels, of 4x4 block idx (callers multiply by 4 and by the stride) */
+{
+    0, 0, 1, 1, 0, 0, 1, 1, 2, 2, 3, 3, 2, 2, 3, 3
+};
+static const uint8_t block_idx_xy[4][4] =   /* inverse of the two tables above: block_idx_xy[x][y] is the block index at column x, row y */
+{
+    { 0, 2, 8,  10},
+    { 1, 3, 9,  11},
+    { 4, 6, 12, 14},
+    { 5, 7, 13, 15}
+};
+
+static const int quant_mf[6][4][4] =    /* quantization multipliers, read as quant_mf[i_qscale % 6][y][x] by the quant_* functions */
+{
+    {  { 13107, 8066, 13107, 8066}, {  8066, 5243,  8066, 5243},
+       { 13107, 8066, 13107, 8066}, {  8066, 5243,  8066, 5243}  },
+    {  { 11916, 7490, 11916, 7490}, {  7490, 4660,  7490, 4660},
+       { 11916, 7490, 11916, 7490}, {  7490, 4660,  7490, 4660}  },
+    {  { 10082, 6554, 10082, 6554}, {  6554, 4194,  6554, 4194},
+       { 10082, 6554, 10082, 6554}, {  6554, 4194,  6554, 4194}  },
+    {  {  9362, 5825,  9362, 5825}, {  5825, 3647,  5825, 3647},
+       {  9362, 5825,  9362, 5825}, {  5825, 3647,  5825, 3647}  },
+    {  {  8192, 5243,  8192, 5243}, {  5243, 3355,  5243, 3355},
+       {  8192, 5243,  8192, 5243}, {  5243, 3355,  5243, 3355}  },
+    {  {  7282, 4559,  7282, 4559}, {  4559, 2893,  4559, 2893},
+       {  7282, 4559,  7282, 4559}, {  4559, 2893,  4559, 2893}  }
+};
+
+static const int i_chroma_qp_table[52] =    /* chroma quantizer for an index in 0..51; x264_macroblock_encode indexes it with the clipped sum of the mb qp and the pps chroma qp index offset */
+{
+     0,  1,  2,  3,  4,  5,  6,  7,  8,  9,
+    10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+    20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+    29, 30, 31, 32, 32, 33, 34, 34, 35, 35,
+    36, 36, 37, 37, 37, 38, 38, 38, 39, 39,
+    39, 39
+};
+
+static const int f_deadzone_intra[4][4][2] = /* [y][x] -> {num, den}: quant_4x4 adds (num/den) << i_qbits as rounding offset for intra blocks */
+{
+    { {1,2}, {3,7}, {2,5}, {1,3} },
+    { {3,7}, {2,5}, {1,3}, {1,4} },
+    { {2,5}, {1,3}, {1,4}, {1,5} },
+    { {1,3}, {1,4}, {1,5}, {1,5} }
+};
+static const int f_deadzone_inter[4][4][2] = /* [y][x] -> {num, den}: same rounding offset fraction, used when b_intra is 0 */
+{
+    { {1,3}, {2,7}, {4,15},{2,9} },
+    { {2,7}, {4,15},{2,9}, {1,6} },
+    { {4,15},{2,9}, {1,6}, {1,7} },
+    { {2,9}, {1,6}, {1,7}, {2,15} }
+};
+
+/****************************************************************************
+ * Scan and Quant functions
+ ****************************************************************************/
+static const int scan_zigzag_x[16]={0, 1, 0, 0, 1, 2, 3, 2, 1, 0, 1, 2, 3, 3, 2, 3}; /* second dct index (column) of the i-th coefficient in scan order */
+static const int scan_zigzag_y[16]={0, 0, 1, 2, 1, 0, 0, 1, 2, 3, 3, 2, 1, 2, 3, 3}; /* first dct index (row) of the i-th coefficient in scan order */
+
+static inline void scan_zigzag_4x4full( int level[16], int16_t dct[4][4] )
+{
+    int i;
+    /* copy all 16 coefficients of dct into level[] following the scan_zigzag_x/y order */
+    for( i = 0; i < 16; i++ )
+    {
+        level[i] = dct[scan_zigzag_y[i]][scan_zigzag_x[i]];
+    }
+}
+static inline void scan_zigzag_4x4( int level[15], int16_t dct[4][4] )
+{
+    int i;
+    /* same scan, but skips scan position 0 (dct[0][0]): level[] receives the 15 remaining coefficients */
+    for( i = 1; i < 16; i++ )
+    {
+        level[i - 1] = dct[scan_zigzag_y[i]][scan_zigzag_x[i]];
+    }
+}
+
+static inline void scan_zigzag_2x2_dc( int level[4], int16_t dct[2][2] )    /* flatten the 2x2 block row by row into level[0..3] */
+{
+    level[0] = dct[0][0];
+    level[1] = dct[0][1];
+    level[2] = dct[1][0];
+    level[3] = dct[1][1];
+}
+
+#if 0   /* disabled variants of the three quantizers below: they use one rounding offset for all positions (1/3 intra, 1/6 inter) instead of the f_deadzone_* tables */
+static void quant_4x4( int16_t dct[4][4], int i_qscale, int b_intra )
+{
+    const int i_qbits = 15 + i_qscale / 6;
+    const int i_mf = i_qscale % 6;
+    const int f = ( 1 << i_qbits ) / ( b_intra ? 3 : 6 );
+
+    int x,y;
+    for( y = 0; y < 4; y++ )
+    {
+        for( x = 0; x < 4; x++ )
+        {
+            if( dct[y][x] > 0 )
+            {
+                dct[y][x] =( f + dct[y][x]  * quant_mf[i_mf][y][x] ) >> i_qbits;
+            }
+            else
+            {
+                dct[y][x] = - ( ( f - dct[y][x]  * quant_mf[i_mf][y][x] ) >> i_qbits );
+            }
+        }
+    }
+}
+static void quant_4x4_dc( int16_t dct[4][4], int i_qscale )
+{
+    const int i_qbits = 15 + i_qscale / 6;
+    const int f2 = ( 2 << i_qbits ) / 3;
+    const int i_qmf = quant_mf[i_qscale%6][0][0];
+    int x,y;
+
+    for( y = 0; y < 4; y++ )
+    {
+        for( x = 0; x < 4; x++ )
+        {
+            if( dct[y][x] > 0 )
+            {
+                dct[y][x] =( f2 + dct[y][x]  * i_qmf) >> ( 1 + i_qbits );
+            }
+            else
+            {
+                dct[y][x] = - ( ( f2 - dct[y][x]  * i_qmf ) >> (1 + i_qbits ) );
+            }
+        }
+    }
+}
+static void quant_2x2_dc( int16_t dct[2][2], int i_qscale, int b_intra )
+{
+    int const i_qbits = 15 + i_qscale / 6;
+    const int f2 = ( 2 << i_qbits ) / ( b_intra ? 3 : 6 );
+    const int i_qmf = quant_mf[i_qscale%6][0][0];
+
+    int x,y;
+    for( y = 0; y < 2; y++ )
+    {
+        for( x = 0; x < 2; x++ )
+        {
+            if( dct[y][x] > 0 )
+            {
+                dct[y][x] =( f2 + dct[y][x]  * i_qmf) >> ( 1 + i_qbits );
+            }
+            else
+            {
+                dct[y][x] = - ( ( f2 - dct[y][x]  * i_qmf ) >> (1 + i_qbits ) );
+            }
+        }
+    }
+}
+#endif
+
+static void quant_4x4( int16_t dct[4][4], int i_qscale, int b_intra )   /* quantize the 16 coefficients of dct in place */
+{
+    const int i_qbits = 15 + i_qscale / 6;  /* right shift applied after the multiplication */
+    const int i_mf = i_qscale % 6;          /* which quant_mf matrix to use */
+
+    int x,y;
+    for( y = 0; y < 4; y++ )
+    {
+        for( x = 0; x < 4; x++ )
+        {
+            const int f = b_intra ?     /* rounding offset of this position: (num / den) << i_qbits, intra or inter table */
+                          (f_deadzone_intra[y][x][0] * ( 1 << i_qbits ) / f_deadzone_intra[y][x][1])
+                          :
+                          (f_deadzone_inter[y][x][0] * ( 1 << i_qbits ) / f_deadzone_inter[y][x][1]);
+
+            if( dct[y][x] > 0 )
+            {
+                dct[y][x] =( f + dct[y][x]  * quant_mf[i_mf][y][x] ) >> i_qbits;
+            }
+            else    /* zero or negative: quantize the magnitude, then negate, so the shifted value is never negative */
+            {
+                dct[y][x] = - ( ( f - dct[y][x]  * quant_mf[i_mf][y][x] ) >> i_qbits );
+            }
+        }
+    }
+}
+
+static void quant_4x4_dc( int16_t dct[4][4], int i_qscale )    /* quantize a 4x4 block of DC coefficients in place */
+{
+    const int i_qbits = 15 + i_qscale / 6;
+    const int i_qmf = quant_mf[i_qscale%6][0][0];   /* one multiplier, the [0][0] entry, for all 16 values */
+    const int f2 = f_deadzone_intra[0][0][0] * ( 2 << i_qbits ) / f_deadzone_intra[0][0][1];   /* intra [0][0] offset, scaled for the extra bit of shift */
+    int x,y;
+
+    for( y = 0; y < 4; y++ )
+    {
+        for( x = 0; x < 4; x++ )
+        {
+            /* same sign handling as quant_4x4, but the shift is one bit larger */
+            if( dct[y][x] > 0 )
+            {
+                dct[y][x] =( f2 + dct[y][x]  * i_qmf) >> ( 1 + i_qbits );
+            }
+            else
+            {
+                dct[y][x] = - ( ( f2 - dct[y][x]  * i_qmf ) >> (1 + i_qbits ) );
+            }
+        }
+    }
+}
+
+static void quant_2x2_dc( int16_t dct[2][2], int i_qscale, int b_intra )    /* quantize a 2x2 block of DC coefficients in place */
+{
+    int const i_qbits = 15 + i_qscale / 6;
+    const int i_qmf = quant_mf[i_qscale%6][0][0];   /* one multiplier, the [0][0] entry, for all 4 values */
+    const int f2 = b_intra ?    /* [0][0] rounding offset of the intra or inter table, scaled for the extra bit of shift */
+                   (f_deadzone_intra[0][0][0] * ( 2 << i_qbits ) / f_deadzone_intra[0][0][1])
+                   :
+                   (f_deadzone_inter[0][0][0] * ( 2 << i_qbits ) / f_deadzone_inter[0][0][1]);
+    int x,y;
+    for( y = 0; y < 2; y++ )
+    {
+        for( x = 0; x < 2; x++ )
+        {
+            if( dct[y][x] > 0 )
+            {
+                dct[y][x] =( f2 + dct[y][x]  * i_qmf) >> ( 1 + i_qbits );
+            }
+            else    /* zero or negative: quantize the magnitude, then negate */
+            {
+                dct[y][x] = - ( ( f2 - dct[y][x]  * i_qmf ) >> (1 + i_qbits ) );
+            }
+        }
+    }
+}
+
+static inline int array_non_zero_count( int *v, int i_count )
+{
+    int i;
+    int i_nz;
+    /* returns how many of the first i_count entries of v are non-zero */
+    for( i = 0, i_nz = 0; i < i_count; i++ )
+    {
+        if( v[i] )
+        {
+            i_nz++;
+        }
+    }
+    return i_nz;
+}
+
+void x264_mb_partition_mvd( x264_macroblock_t *mb, int i_list, int i_part, int i_sub, int mvd[2])
+{
+    int mvp[2];     /* predicted motion vector, filled by x264_mb_predict_mv */
+    /* Computes mvd = mv - predicted mv for one (sub)partition, returns it in mvd[] and stores it in mb->partition */
+    int x,  y;      /* first partition cell, from x264_mb_partition_getxy (presumably in 4x4-block units -- confirm) */
+    int w,  h;      /* number of cells covered in each direction, from x264_mb_partition_size */
+    int dx, dy;
+
+    x264_mb_partition_getxy( mb, i_part, i_sub, &x, &y );
+    x264_mb_partition_size ( mb, i_part, i_sub, &w, &h );
+    x264_mb_predict_mv(  mb, i_list, i_part, i_sub, mvp );
+
+    mvd[0] = mb->partition[x][y].mv[i_list][0] - mvp[0];
+    mvd[1] = mb->partition[x][y].mv[i_list][1] - mvp[1];
+    /* every cell covered by the (sub)partition gets the same difference */
+    for( dx = 0; dx < w; dx++ )
+    {
+        for( dy = 0; dy < h; dy++ )
+        {
+            mb->partition[x+dx][y+dy].mvd[i_list][0] = mvd[0];
+            mb->partition[x+dx][y+dy].mvd[i_list][1] = mvd[1];
+        }
+    }
+}
+
+/* (ref: JVT-B118)
+ * x264_mb_decimate_score: given the i_max scanned coefficients in dct, returns a score telling whether the
+ * block is cheap enough to be forced to zero (a low score means it can be set to null).
+ * Returns 9 as soon as a coefficient with magnitude > 1 is found; otherwise sums a per-coefficient cost taken
+ * from i_ds_table and indexed by the zero run found below each non-zero coefficient.
+ * Callers in this file: luma 8x8 group nulled if score < 4, whole luma mb if score < 6,
+ * chroma if score < 7 (inter macroblocks only).
+ */
+static int x264_mb_decimate_score( int *dct, int i_max )
+{
+    static const int i_ds_table[16] = { 3, 2, 2, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };  /* cost indexed by run length */
+
+    int i_score = 0;
+    int idx = i_max - 1;
+    /* skip the zeros at the end of the scan */
+    while( idx >= 0 && dct[idx] == 0 )
+    {
+        idx--;
+    }
+    /* idx now sits on the last non-zero coefficient (or is -1); walk towards index 0 */
+    while( idx >= 0 )
+    {
+        int i_run;
+
+        if( abs( dct[idx--] ) > 1 )
+        {
+            return 9;   /* larger than every threshold used by the callers: never decimated */
+        }
+
+        i_run = 0;      /* count the zeros between this coefficient and the previous non-zero one */
+        while( idx >= 0 && dct[idx] == 0 )
+        {
+            idx--;
+            i_run++;
+        }
+        i_score += i_ds_table[i_run];
+    }
+
+    return i_score;
+}
+
+void x264_mb_encode_i4x4( x264_t *h, x264_macroblock_t *mb, int idx, int i_qscale )
+{
+    x264_mb_context_t *ctx = mb->context;
+    /* Encodes luma 4x4 block idx: levels go to mb->block[idx].luma4x4 and the reconstruction is written to fdec */
+    uint8_t *p_src = ctx->p_img[0] + 4 * block_idx_x[idx] + 4 * block_idx_y[idx] * ctx->i_img[0];
+    int      i_src = ctx->i_img[0];
+    uint8_t *p_dst = ctx->p_fdec[0] + 4 * block_idx_x[idx] + 4 * block_idx_y[idx] * ctx->i_fdec[0];
+    int      i_dst = ctx->i_fdec[0];
+
+    int16_t luma[4][4];
+    int16_t dct4x4[4][4];
+
+    /* we calculate diff (presumably source minus the prediction the caller already stored at p_dst -- confirm in pixf.sub4x4) */
+    h->pixf.sub4x4( luma, p_src, i_src, p_dst, i_dst );
+
+    /* calculate dct coeffs, quantized with the intra deadzone */
+    h->dctf.dct4x4( dct4x4, luma );
+    quant_4x4( dct4x4, i_qscale, 1 );
+
+    scan_zigzag_4x4full( mb->block[idx].luma4x4, dct4x4 );
+
+    /* output samples to fdec */
+    x264_mb_dequant_4x4( dct4x4, i_qscale );
+    h->dctf.idct4x4( luma, dct4x4 );
+
+    /* put pixel to fdec */
+    h->pixf.add4x4( p_dst, i_dst, luma );
+}
+
+static void x264_mb_encode_i16x16( x264_t *h, x264_macroblock_t *mb, int i_qscale )
+{
+    x264_mb_context_t *ctx = mb->context;
+    /* Encodes the luma of an I_16x16 mb: fills mb->luma16x16_dc and mb->block[0..15].residual_ac, then rebuilds fdec */
+    uint8_t *p_src = ctx->p_img[0];
+    int      i_src = ctx->i_img[0];
+    uint8_t *p_dst = ctx->p_fdec[0];
+    int      i_dst = ctx->i_fdec[0];
+
+    int16_t luma[16][4][4];
+    int16_t dct4x4[16+1][4][4];     /* [0] collects the 16 DC coefficients, [1+i] is 4x4 block i */
+
+    int i;
+
+    /* calculate the diff */
+    h->pixf.sub16x16( luma, p_src, i_src, p_dst, i_dst );
+
+    /* calculate dct coeffs */
+    for( i = 0; i < 16; i++ )
+    {
+        h->dctf.dct4x4( dct4x4[i+1], luma[i] );
+
+        /* copy dc coeff (taken before quantization) */
+        dct4x4[0][block_idx_y[i]][block_idx_x[i]] = dct4x4[1+i][0][0];
+
+        quant_4x4( dct4x4[1+i], i_qscale, 1 );
+        scan_zigzag_4x4( mb->block[i].residual_ac, dct4x4[1+i] );
+    }
+    /* the DC block gets its own transform and quantizer */
+    h->dctf.dct4x4dc( dct4x4[0], dct4x4[0] );
+    quant_4x4_dc( dct4x4[0], i_qscale );
+    scan_zigzag_4x4full( mb->luma16x16_dc, dct4x4[0] );
+
+    /* output samples to fdec */
+    h->dctf.idct4x4dc( dct4x4[0], dct4x4[0] );
+    x264_mb_dequant_4x4_dc( dct4x4[0], i_qscale );  /* XXX not inversed */
+
+    /* dequantize and inverse transform each 4x4 block */
+    for( i = 0; i < 16; i++ )
+    {
+        x264_mb_dequant_4x4( dct4x4[1+i], i_qscale );
+
+        /* copy dc coeff (the reconstructed DC replaces position [0][0]) */
+        dct4x4[1+i][0][0] = dct4x4[0][block_idx_y[i]][block_idx_x[i]];
+
+        h->dctf.idct4x4( luma[i], dct4x4[i+1] );
+    }
+    /* put pixels to fdec */
+    h->pixf.add16x16( p_dst, i_dst, luma );
+}
+
+static void x264_mb_encode_8x8( x264_t *h, x264_macroblock_t *mb, int b_inter, int i_qscale )
+{
+    x264_mb_context_t *ctx = mb->context;
+    /* Encodes both 8x8 chroma planes: fills mb->chroma_dc[ch] and mb->block[16..23].residual_ac, then rebuilds fdec */
+    uint8_t *p_src, *p_dst;
+    int      i_src, i_dst;
+
+    int i, ch;
+    /* b_inter selects the inter deadzone tables and enables decimation; i_qscale is the quantizer used for chroma */
+
+    for( ch = 0; ch < 2; ch++ )
+    {
+        int16_t chroma[4][4][4];
+        int16_t dct2x2[2][2];
+        int16_t dct4x4[4][4][4];
+        int     i_decimate_score = 0;   /* restarted for every plane so that each one is judged on its own coefficients */
+        p_src = ctx->p_img[1+ch];
+        i_src = ctx->i_img[1+ch];
+        p_dst = ctx->p_fdec[1+ch];
+        i_dst = ctx->i_fdec[1+ch];
+
+        /* calculate the diff */
+        h->pixf.sub8x8( chroma, p_src, i_src, p_dst, i_dst );
+
+        /* calculate dct coeffs */
+        for( i = 0; i < 4; i++ )
+        {
+            h->dctf.dct4x4( dct4x4[i], chroma[i] );
+
+            /* copy dc coeff (taken before quantization) */
+            dct2x2[block_idx_y[i]][block_idx_x[i]] = dct4x4[i][0][0];
+
+            quant_4x4( dct4x4[i], i_qscale, b_inter ? 0 : 1 );
+            scan_zigzag_4x4( mb->block[16+i+ch*4].residual_ac, dct4x4[i] );
+
+            i_decimate_score += x264_mb_decimate_score( mb->block[16+i+ch*4].residual_ac, 15 );
+        }
+
+        h->dctf.dct2x2dc( dct2x2, dct2x2 );
+        quant_2x2_dc( dct2x2, i_qscale, b_inter ? 0 : 1  );
+        scan_zigzag_2x2_dc( mb->chroma_dc[ch], dct2x2 );
+
+        if( i_decimate_score < 7 && b_inter )
+        {
+            /* Near null chroma 8x8 block so make it null (bits saving); the DC levels in chroma_dc are kept */
+            for( i = 0; i < 4; i++ )
+            {
+                int x, y;
+                for( x = 0; x < 15; x++ )
+                {
+                    mb->block[16+i+ch*4].residual_ac[x] = 0;
+                }
+                for( x = 0; x < 4; x++ )
+                {
+                    for( y = 0; y < 4; y++ )
+                    {
+                        dct4x4[i][x][y] = 0;
+                    }
+                }
+            }
+        }
+
+        /* output samples to fdec */
+        h->dctf.idct2x2dc( dct2x2, dct2x2 );
+        x264_mb_dequant_2x2_dc( dct2x2, i_qscale );  /* XXX not inversed */
+
+        /* dequantize and inverse transform each 4x4 block */
+        for( i = 0; i < 4; i++ )
+        {
+            x264_mb_dequant_4x4( dct4x4[i], i_qscale );
+
+            /* copy dc coeff (the reconstructed DC replaces position [0][0]) */
+            dct4x4[i][0][0] = dct2x2[block_idx_y[i]][block_idx_x[i]];
+
+            h->dctf.idct4x4( chroma[i], dct4x4[i] );
+        }
+        h->pixf.add8x8( p_dst, i_dst, chroma );
+    }
+}
+
+/*****************************************************************************
+ * x264_macroblock_encode: predict, transform, quantize and reconstruct mb, then derive cbp, non-zero counts and P_SKIP
+ *****************************************************************************/
+void x264_macroblock_encode( x264_t *h, x264_macroblock_t *mb )
+{
+    x264_mb_context_t *ctx = mb->context;
+    int i;
+
+    int     i_qscale;
+
+    /* quantification scale */
+    i_qscale = mb->i_qp;
+
+    if( mb->i_type == I_16x16 )
+    {
+        /* do the right prediction */
+        h->predict_16x16[mb->i_intra16x16_pred_mode]( ctx->p_fdec[0], ctx->i_fdec[0] );
+
+        /* encode the 16x16 macroblock */
+        x264_mb_encode_i16x16( h, mb, i_qscale );
+
+        /* fix the pred mode value */
+        mb->i_intra16x16_pred_mode = x264_mb_pred_mode16x16_fix[mb->i_intra16x16_pred_mode];
+    }
+    else if( mb->i_type == I_4x4 )
+    {
+        for( i = 0; i < 16; i++ )
+        {
+            uint8_t *p_dst_by;
+
+            /* Do the right prediction */
+            p_dst_by = ctx->p_fdec[0] + 4 * block_idx_x[i] + 4 * block_idx_y[i] * ctx->i_fdec[0];
+            h->predict_4x4[mb->block[i].i_intra4x4_pred_mode]( p_dst_by, ctx->i_fdec[0] );
+
+            /* encode one 4x4 block */
+            x264_mb_encode_i4x4( h, mb, i, i_qscale );
+
+            /* fix the pred mode value */
+            mb->block[i].i_intra4x4_pred_mode = x264_mb_pred_mode4x4_fix[mb->block[i].i_intra4x4_pred_mode];
+        }
+    }
+    else    /* Inter MB */
+    {
+        int16_t dct4x4[16][4][4];
+
+        int i8x8, i4x4, idx;
+        int i_decimate_mb = 0;      /* sum of the decimation scores of the four 8x8 groups */
+
+        /* Motion compensation */
+        x264_mb_mc( h, mb );
+
+        for( i8x8 = 0; i8x8 < 4; i8x8++ )
+        {
+            int16_t luma[4][4];
+            int i_decimate_8x8;
+
+            /* encode one 4x4 block */
+            i_decimate_8x8 = 0;
+            for( i4x4 = 0; i4x4 < 4; i4x4++ )
+            {
+                uint8_t *p_src, *p_dst;
+
+                idx = i8x8 * 4 + i4x4;
+
+                p_src = ctx->p_img[0] + 4 * block_idx_x[idx] + 4 * block_idx_y[idx] * ctx->i_img[0];
+                p_dst = ctx->p_fdec[0] + 4 * block_idx_x[idx] + 4 * block_idx_y[idx] * ctx->i_fdec[0];
+
+                /* we calculate diff */
+                h->pixf.sub4x4( luma, p_src, ctx->i_img[0],p_dst, ctx->i_fdec[0] );
+
+                /* calculate dct coeffs */
+                h->dctf.dct4x4( dct4x4[idx], luma );
+                quant_4x4( dct4x4[idx], i_qscale, 0 );
+
+                scan_zigzag_4x4full( mb->block[idx].luma4x4, dct4x4[idx] );
+                i_decimate_8x8 += x264_mb_decimate_score( mb->block[idx].luma4x4, 16 );
+            }
+
+            /* decimate this 8x8 block */
+            i_decimate_mb += i_decimate_8x8;
+            if( i_decimate_8x8 < 4 )
+            {
+                for( i4x4 = 0; i4x4 < 4; i4x4++ )
+                {
+                    int x, y;
+                    idx = i8x8 * 4 + i4x4;
+                    for( i = 0; i < 16; i++ )
+                    {
+                        mb->block[idx].luma4x4[i] = 0;
+                    }
+                    for( x = 0; x < 4; x++ )
+                    {
+                        for( y = 0; y < 4; y++ )
+                        {
+                            dct4x4[idx][x][y] = 0;
+                        }
+                    }
+                }
+            }
+        }
+
+        if( i_decimate_mb < 6 )     /* whole mb nearly empty: drop all luma levels, fdec keeps the plain prediction */
+        {
+            for( i8x8 = 0; i8x8 < 4; i8x8++ )
+            {
+                for( i4x4 = 0; i4x4 < 4; i4x4++ )
+                {
+                    for( i = 0; i < 16; i++ )
+                    {
+                        mb->block[i8x8 * 4 + i4x4].luma4x4[i] = 0;
+                    }
+                }
+            }
+        }
+        else
+        {
+            for( i8x8 = 0; i8x8 < 4; i8x8++ )
+            {
+                int16_t luma[4][4];
+                /* TODO we could avoid it if we had decimate this 8x8 block */
+                /* output samples to fdec */
+                for( i4x4 = 0; i4x4 < 4; i4x4++ )
+                {
+                    uint8_t *p_dst;
+
+                    idx = i8x8 * 4 + i4x4;
+
+                    x264_mb_dequant_4x4( dct4x4[idx], i_qscale );
+                    h->dctf.idct4x4( luma, dct4x4[idx] );
+
+                    /* put pixel to fdec */
+                    p_dst = ctx->p_fdec[0] + 4 * block_idx_x[idx] + 4 * block_idx_y[idx] * ctx->i_fdec[0];
+                    h->pixf.add4x4( p_dst, ctx->i_fdec[0], luma );
+                }
+            }
+        }
+    }
+
+    /* encode chroma: the chroma quantizer is looked up from the clipped luma qp + pps offset */
+    i_qscale = i_chroma_qp_table[x264_clip3( i_qscale + h->pps->i_chroma_qp_index_offset, 0, 51 )];
+    if( IS_INTRA( mb->i_type ) )
+    {
+        /* do the right prediction */
+        h->predict_8x8[mb->i_chroma_pred_mode]( ctx->p_fdec[1], ctx->i_fdec[1] );
+        h->predict_8x8[mb->i_chroma_pred_mode]( ctx->p_fdec[2], ctx->i_fdec[2] );
+    }
+
+    /* encode the 8x8 blocks */
+    x264_mb_encode_8x8( h, mb, !IS_INTRA( mb->i_type ), i_qscale );
+
+    /* fix the pred mode value */
+    if( IS_INTRA( mb->i_type ) )
+    {
+        mb->i_chroma_pred_mode = x264_mb_pred_mode8x8_fix[mb->i_chroma_pred_mode];
+    }
+
+    /* Calculate the Luma/Chroma pattern and non_zero_count */
+    if( mb->i_type == I_16x16 )
+    {
+        mb->i_cbp_luma = 0x00;      /* I_16x16 luma cbp is all or nothing: 0x0f as soon as one AC level is non-zero */
+        for( i = 0; i < 16; i++ )
+        {
+            mb->block[i].i_non_zero_count = array_non_zero_count( mb->block[i].residual_ac, 15 );
+            if( mb->block[i].i_non_zero_count > 0 )
+            {
+                mb->i_cbp_luma = 0x0f;
+            }
+        }
+    }
+    else
+    {
+        mb->i_cbp_luma = 0x00;      /* one bit per group of four 4x4 blocks (bit i/4 for block i) */
+        for( i = 0; i < 16; i++ )
+        {
+            mb->block[i].i_non_zero_count = array_non_zero_count( mb->block[i].luma4x4, 16 );
+            if( mb->block[i].i_non_zero_count > 0 )
+            {
+                mb->i_cbp_luma |= 1 << (i/4);
+            }
+        }
+    }
+
+    /* Calculate the chroma pattern */
+    mb->i_cbp_chroma = 0x00;
+    for( i = 0; i < 8; i++ )
+    {
+        mb->block[16+i].i_non_zero_count = array_non_zero_count( mb->block[16+i].residual_ac, 15 );
+        if( mb->block[16+i].i_non_zero_count > 0 )
+        {
+            mb->i_cbp_chroma = 0x02;    /* dc+ac (we can't do only ac) */
+        }
+    }
+    if( mb->i_cbp_chroma == 0x00 &&
+        ( array_non_zero_count( mb->chroma_dc[0], 4 ) > 0 || array_non_zero_count( mb->chroma_dc[1], 4 ) > 0 ) )
+    {
+        mb->i_cbp_chroma = 0x01;    /* dc only */
+    }
+
+    /* Check for P_SKIP
+     * XXX: in the me perhaps we should take x264_mb_predict_mv_pskip into account
+     *      (if multiple mv give same result)*/
+    if( mb->i_type == P_L0 && mb->i_partition == D_16x16 &&
+        mb->i_cbp_luma == 0x00 && mb->i_cbp_chroma == 0x00 )
+    {
+
+        int i_ref;
+        int mvx, mvy;
+        x264_mb_partition_get( mb, 0, 0, 0, &i_ref, &mvx, &mvy );
+
+        if( i_ref == 0 )    /* a skip is only possible with ref 0 and the predicted skip vector */
+        {
+            int mvp[2];
+
+            x264_mb_predict_mv_pskip( mb, mvp );
+            if( mvp[0] == mvx && mvp[1] == mvy )
+            {
+                mb->i_type = P_SKIP;
+            }
+        }
+    }
+}
+
+
+/* Special values passed as i_idx to block_residual_write_cavlc for the DC
+ * blocks (chroma_dc / luma16x16_dc) instead of an index into mb->block[] */
+#define BLOCK_INDEX_CHROMA_DC   (-1)
+#define BLOCK_INDEX_LUMA_DC     (-2)
+
+/* Append the code word v to the bitstream s: v.i_bits written on v.i_size bits */
+static inline void bs_write_vlc( bs_t *s, vlc_t v )
+{
+    const int i_length = v.i_size;
+
+    bs_write( s, i_length, v.i_bits );
+}
+
+/****************************************************************************
+ * block_residual_write_cavlc:
+ *  write to s the CAVLC representation of the i_count coefficients of l.
+ *  i_idx is the index of the block in mb (given to
+ *  x264_mb_predict_non_zero_code to select the coeff token table), or
+ *  BLOCK_INDEX_LUMA_DC / BLOCK_INDEX_CHROMA_DC for the DC blocks.
+ ****************************************************************************/
+static void block_residual_write_cavlc( x264_t *h, bs_t *s, x264_macroblock_t *mb, int i_idx, int *l, int i_count )
+{
+    int level[16], run[16];     /* non zero coefficients (last one first) and the zero run preceding each */
+    int i_total = 0;            /* number of non zero coefficients */
+    int i_trailing = 0;         /* number of trailing +/-1 (3 at most) */
+    int i_total_zero = 0;       /* number of zeros before the last non zero coefficient */
+    unsigned int i_sign = 0;    /* sign bits of the trailing ones (1: negative) */
+    int b_trailing = 1;
+    int i_suffix_length;
+    int i_zero_left;
+    int i_table;
+    int i_pos;
+    int i;
+
+    /* skip the zeros at the end of the block */
+    for( i_pos = i_count - 1; i_pos >= 0 && l[i_pos] == 0; i_pos-- )
+        ;
+
+    /* level and run and total, from the last non zero coefficient to the first */
+    while( i_pos >= 0 )
+    {
+        const int i_level = l[i_pos--];
+        int i_run = 0;
+
+        for( ; i_pos >= 0 && l[i_pos] == 0; i_pos-- )
+        {
+            i_run++;
+        }
+
+        level[i_total] = i_level;
+        run[i_total]   = i_run;
+        i_total_zero  += i_run;
+        i_total++;
+
+        if( !b_trailing || abs( i_level ) != 1 || i_trailing >= 3 )
+        {
+            /* the sequence of trailing ones is over */
+            b_trailing = 0;
+            continue;
+        }
+        i_sign = ( i_sign << 1 ) | ( i_level < 0 ? 0x01 : 0x00 );
+        i_trailing++;
+    }
+
+    /* total/trailing */
+    if( i_idx != BLOCK_INDEX_CHROMA_DC )
+    {
+        /* x264_mb_predict_non_zero_code return 0 <-> (16+16+1)>>1 = 16 */
+        static const int ct_index[17] = {0,0,1,1,2,2,2,2,3,3,3,3,3,3,3,3,3 };
+
+        i_table = ct_index[x264_mb_predict_non_zero_code( h, mb, i_idx == BLOCK_INDEX_LUMA_DC ? 0 : i_idx )];
+    }
+    else
+    {
+        i_table = 4;
+    }
+    bs_write_vlc( s, x264_coeff_token[i_table][i_total*4+i_trailing] );
+
+    if( i_total <= 0 )
+    {
+        return;
+    }
+
+    /* sign of the trailing ones */
+    if( i_trailing > 0 )
+    {
+        bs_write( s, i_trailing, i_sign );
+    }
+
+    /* remaining levels */
+    i_suffix_length = ( i_total > 10 && i_trailing < 3 ) ? 1 : 0;
+    for( i = i_trailing; i < i_total; i++ )
+    {
+        const int i_abs = abs( level[i] );
+        int i_level_code = level[i] < 0 ? 2 * i_abs - 1 : 2 * i_abs - 2;
+        int i_prefix;
+
+        if( i == i_trailing && i_trailing < 3 )
+        {
+            i_level_code -= 2; /* as level[i] can't be 1 for the first one if i_trailing < 3 */
+        }
+        i_prefix = i_level_code >> i_suffix_length;
+
+        if( i_prefix < 14 )
+        {
+            bs_write_vlc( s, x264_level_prefix[i_prefix] );
+            if( i_suffix_length > 0 )
+            {
+                bs_write( s, i_suffix_length, i_level_code );
+            }
+        }
+        else if( i_suffix_length == 0 && i_level_code < 30 )
+        {
+            bs_write_vlc( s, x264_level_prefix[14] );
+            bs_write( s, 4, i_level_code - 14 );
+        }
+        else if( i_suffix_length > 0 && i_prefix == 14 )
+        {
+            bs_write_vlc( s, x264_level_prefix[14] );
+            bs_write( s, i_suffix_length, i_level_code );
+        }
+        else
+        {
+            /* escape: prefix 15 followed by a 12 bits suffix */
+            bs_write_vlc( s, x264_level_prefix[15] );
+            i_level_code -= ( i_suffix_length == 0 ) ? 15 + 15 : 15 << i_suffix_length;
+
+            if( i_level_code >= ( 1 << 12 ) || i_level_code < 0 )
+            {
+                fprintf( stderr, "OVERFLOW levelcode=%d\n", i_level_code );
+            }
+
+            bs_write( s, 12, i_level_code );    /* check overflow ?? */
+        }
+
+        /* adapt the suffix length for the next level */
+        if( i_suffix_length == 0 )
+        {
+            i_suffix_length = 1;
+        }
+        if( i_suffix_length < 6 && i_abs > ( 3 << ( i_suffix_length - 1 ) ) )
+        {
+            i_suffix_length++;
+        }
+    }
+
+    /* total zeros (only when the block is not full) */
+    if( i_total < i_count )
+    {
+        if( i_idx != BLOCK_INDEX_CHROMA_DC )
+        {
+            bs_write_vlc( s, x264_total_zeros[i_total-1][i_total_zero] );
+        }
+        else
+        {
+            bs_write_vlc( s, x264_total_zeros_dc[i_total-1][i_total_zero] );
+        }
+    }
+
+    /* run before each coefficient but the last written one, while zeros are left */
+    i_zero_left = i_total_zero;
+    for( i = 0; i < i_total - 1 && i_zero_left > 0; i++ )
+    {
+        bs_write_vlc( s, x264_run_before[X264_MIN( i_zero_left - 1, 6 )][run[i]] );
+
+        i_zero_left -= run[i];
+    }
+}
+
+/*****************************************************************************
+ * x264_macroblock_write_cavlc:
+ *  write the CAVLC syntax of the macroblock mb to s: mb type, intra
+ *  prediction modes or reference indexes and motion vector differences,
+ *  coded block pattern, qp delta and residual blocks.
+ *  An error is printed and nothing is written when the slice type is not
+ *  I/P/B or when the macroblock is a B_8x8.
+ *****************************************************************************/
+void x264_macroblock_write_cavlc( x264_t *h, bs_t *s, x264_macroblock_t *mb )
+{
+    int i_mb_i_offset;
+    int i_qp_pred;
+    int i;
+
+    /* offset added to the intra mb types, depending on the slice type */
+    switch( h->sh.i_type )
+    {
+        case SLICE_TYPE_I:
+            i_mb_i_offset = 0;
+            break;
+        case SLICE_TYPE_P:
+            i_mb_i_offset = 5;
+            break;
+        case SLICE_TYPE_B:
+            i_mb_i_offset = 23;
+            break;
+        default:
+            fprintf( stderr, "internal error or slice unsupported\n" );
+            return;
+    }
+
+    /* Write:
+      - type
+      - prediction
+      - mv */
+    if( mb->i_type == I_PCM )
+    {
+        int i_plane;
+
+        /* Untested */
+        bs_write_ue( s, i_mb_i_offset + 25 );
+
+        bs_align_0( s );
+        /* Luma (16x16) then Cb and Cr (8x8): the block is read line by line,
+         * two consecutive lines being i_stride samples apart in the plane */
+        for( i_plane = 0; i_plane < 3; i_plane++ )
+        {
+            const int i_size   = i_plane == 0 ? 16 : 8;
+            const int i_stride = h->picture->i_stride[i_plane];
+            const int i_origin = mb->i_mb_y * i_size * i_stride + mb->i_mb_x * i_size;
+            int x, y;
+
+            for( y = 0; y < i_size; y++ )
+            {
+                for( x = 0; x < i_size; x++ )
+                {
+                    bs_write( s, 8, h->picture->plane[i_plane][i_origin + y * i_stride + x] );
+                }
+            }
+        }
+
+        for( i = 0; i < 16 + 8; i++ )
+        {
+            /* special case */
+            mb->block[i].i_non_zero_count = 16;
+        }
+        return;
+    }
+    else if( mb->i_type == I_4x4 )
+    {
+        bs_write_ue( s, i_mb_i_offset + 0 );
+
+        /* Prediction: Luma */
+        for( i = 0; i < 16; i++ )
+        {
+            int i_predicted_mode = x264_mb_predict_intra4x4_mode( h, mb, i );
+            int i_mode = mb->block[i].i_intra4x4_pred_mode;
+
+            if( i_predicted_mode == i_mode)
+            {
+                bs_write1( s, 1 );  /* b_prev_intra4x4_pred_mode */
+            }
+            else
+            {
+                bs_write1( s, 0 );  /* b_prev_intra4x4_pred_mode */
+                /* the predicted mode is skipped in the numbering of the remaining modes */
+                if( i_mode < i_predicted_mode )
+                {
+                    bs_write( s, 3, i_mode );
+                }
+                else
+                {
+                    bs_write( s, 3, i_mode - 1 );
+                }
+            }
+        }
+        /* Prediction: chroma */
+        bs_write_ue( s, mb->i_chroma_pred_mode );
+    }
+    else if( mb->i_type == I_16x16 )
+    {
+        /* the mb type carries the prediction mode and the coded block pattern */
+        bs_write_ue( s, i_mb_i_offset + 1 + mb->i_intra16x16_pred_mode +
+                                            mb->i_cbp_chroma * 4 +
+                                            ( mb->i_cbp_luma == 0 ? 0 : 12 ) );
+        /* Prediction: chroma */
+        bs_write_ue( s, mb->i_chroma_pred_mode );
+    }
+    else if( mb->i_type == P_L0 )
+    {
+        int mvp[2];
+
+        if( mb->i_partition == D_16x16 )
+        {
+            bs_write_ue( s, 0 );
+
+            if( h->sh.i_num_ref_idx_l0_active > 1 )
+            {
+                bs_write_te( s, h->sh.i_num_ref_idx_l0_active - 1, mb->partition[0][0].i_ref[0] );
+            }
+            x264_mb_predict_mv( mb, 0, 0, 0, mvp );
+            bs_write_se( s, mb->partition[0][0].mv[0][0] - mvp[0] );
+            bs_write_se( s, mb->partition[0][0].mv[0][1] - mvp[1] );
+        }
+        else if( mb->i_partition == D_16x8 )
+        {
+            bs_write_ue( s, 1 );
+            if( h->sh.i_num_ref_idx_l0_active > 1 )
+            {
+                bs_write_te( s, h->sh.i_num_ref_idx_l0_active - 1, mb->partition[0][0].i_ref[0] );
+                bs_write_te( s, h->sh.i_num_ref_idx_l0_active - 1, mb->partition[0][2].i_ref[0] );
+            }
+
+            x264_mb_predict_mv( mb, 0, 0, 0, mvp );
+            bs_write_se( s, mb->partition[0][0].mv[0][0] - mvp[0] );
+            bs_write_se( s, mb->partition[0][0].mv[0][1] - mvp[1] );
+
+            x264_mb_predict_mv( mb, 0, 1, 0, mvp );
+            bs_write_se( s, mb->partition[0][2].mv[0][0] - mvp[0] );
+            bs_write_se( s, mb->partition[0][2].mv[0][1] - mvp[1] );
+        }
+        else if( mb->i_partition == D_8x16 )
+        {
+            bs_write_ue( s, 2 );
+            if( h->sh.i_num_ref_idx_l0_active > 1 )
+            {
+                bs_write_te( s, h->sh.i_num_ref_idx_l0_active - 1, mb->partition[0][0].i_ref[0] );
+                bs_write_te( s, h->sh.i_num_ref_idx_l0_active - 1, mb->partition[2][0].i_ref[0] );
+            }
+
+            x264_mb_predict_mv( mb, 0, 0, 0, mvp );
+            bs_write_se( s, mb->partition[0][0].mv[0][0] - mvp[0] );
+            bs_write_se( s, mb->partition[0][0].mv[0][1] - mvp[1] );
+
+            x264_mb_predict_mv( mb, 0, 1, 0, mvp );
+            bs_write_se( s, mb->partition[2][0].mv[0][0] - mvp[0] );
+            bs_write_se( s, mb->partition[2][0].mv[0][1] - mvp[1] );
+        }
+    }
+    else if( mb->i_type == P_8x8 )
+    {
+        int b_sub_ref0;
+
+        /* mb type 4 is used when the 4 sub macroblocks use the reference 0,
+         * the reference indexes are then not written */
+        if( mb->partition[0][0].i_ref[0] == 0 &&
+            mb->partition[0][2].i_ref[0] == 0 &&
+            mb->partition[2][0].i_ref[0] == 0 &&
+            mb->partition[2][2].i_ref[0] == 0 )
+        {
+            bs_write_ue( s, 4 );
+            b_sub_ref0 = 0;
+        }
+        else
+        {
+            bs_write_ue( s, 3 );
+            b_sub_ref0 = 1;
+        }
+        /* sub mb type */
+        for( i = 0; i < 4; i++ )
+        {
+            switch( mb->i_sub_partition[i] )
+            {
+                case D_L0_8x8:
+                    bs_write_ue( s, 0 );
+                    break;
+                case D_L0_8x4:
+                    bs_write_ue( s, 1 );
+                    break;
+                case D_L0_4x8:
+                    bs_write_ue( s, 2 );
+                    break;
+                case D_L0_4x4:
+                    bs_write_ue( s, 3 );
+                    break;
+            }
+        }
+        /* ref0 */
+        if( h->sh.i_num_ref_idx_l0_active > 1 && b_sub_ref0 )
+        {
+            bs_write_te( s, h->sh.i_num_ref_idx_l0_active - 1, mb->partition[0][0].i_ref[0] );
+            bs_write_te( s, h->sh.i_num_ref_idx_l0_active - 1, mb->partition[2][0].i_ref[0] );
+            bs_write_te( s, h->sh.i_num_ref_idx_l0_active - 1, mb->partition[0][2].i_ref[0] );
+            bs_write_te( s, h->sh.i_num_ref_idx_l0_active - 1, mb->partition[2][2].i_ref[0] );
+        }
+        /* motion vector difference of every partition of every sub macroblock */
+        for( i = 0; i < 4; i++ )
+        {
+            int i_part;
+            for( i_part = 0; i_part < x264_mb_partition_count_table[mb->i_sub_partition[i]]; i_part++ )
+            {
+                int mvx, mvy;
+                int mvp[2];
+
+                x264_mb_partition_get( mb, 0, i, i_part, NULL, &mvx, &mvy );
+                x264_mb_predict_mv( mb, 0, i, i_part, mvp );
+
+                bs_write_se( s, mvx - mvp[0] );
+                bs_write_se( s, mvy - mvp[1]);
+            }
+        }
+    }
+    else if( mb->i_type == B_8x8 )
+    {
+        fprintf( stderr, "invalid/unhandled mb_type (B_8x8)\n" );
+        return;
+    }
+    else if( mb->i_type != B_DIRECT )
+    {
+        /* All B mode */
+        /* Motion Vector */
+        int i_part = x264_mb_partition_count_table[mb->i_partition];
+        int i_ref;
+        int mvx, mvy;
+        int mvp[2];
+
+        int b_list0[2];
+        int b_list1[2];
+
+        /* init ref list utilisations */
+        for( i = 0; i < 2; i++ )
+        {
+            b_list0[i] = x264_mb_type_list0_table[mb->i_type][i];
+            b_list1[i] = x264_mb_type_list1_table[mb->i_type][i];
+        }
+
+
+        if( mb->i_partition == D_16x16 )
+        {
+            if( b_list0[0] && b_list1[0] )
+            {
+                bs_write_ue( s, 3 );
+            }
+            else if( b_list1[0] )
+            {
+                bs_write_ue( s, 2 );
+            }
+            else
+            {
+                bs_write_ue( s, 1 );
+            }
+        }
+        else
+        {
+            if( mb->i_type == B_BI_BI )
+            {
+                bs_write_ue( s, 20 + (mb->i_partition == D_16x8 ? 0 : 1 ) );
+            }
+            else if( b_list0[0] && b_list1[0] )
+            {
+                /* B_BI_LX* */
+                bs_write_ue( s, 16 + (b_list0[1]?0:2) + (mb->i_partition == D_16x8?0:1) );
+            }
+            else if( b_list0[1] && b_list1[1] )
+            {
+                /* B_LX_BI: the list used by the first partition selects
+                 * between B_L0_BI (12/13) and B_L1_BI (14/15) */
+                bs_write_ue( s, 12 + (b_list0[0]?0:2) + (mb->i_partition == D_16x8?0:1) );
+            }
+            else if( b_list1[1] )
+            {
+                /* B_LX_L1 */
+                bs_write_ue( s, 6 + (b_list0[0]?2:0) + (mb->i_partition == D_16x8?0:1) );
+            }
+            else if( b_list0[1] )
+            {
+                /* B_LX_L0 */
+                bs_write_ue( s, 4 + (b_list0[0]?0:6) + (mb->i_partition == D_16x8?0:1) );
+            }
+        }
+
+        /* reference indexes: list 0 then list 1 */
+        if( h->sh.i_num_ref_idx_l0_active > 1 )
+        {
+            for( i = 0; i < i_part; i++ )
+            {
+                if( b_list0[i] )
+                {
+                    x264_mb_partition_get( mb, 0, i, 0, &i_ref, NULL, NULL );
+                    bs_write_te( s, h->sh.i_num_ref_idx_l0_active - 1, i_ref );
+                }
+            }
+        }
+        if( h->sh.i_num_ref_idx_l1_active > 1 )
+        {
+            for( i = 0; i < i_part; i++ )
+            {
+                if( b_list1[i] )
+                {
+                    x264_mb_partition_get( mb, 1, i, 0, &i_ref, NULL, NULL );
+                    bs_write_te( s, h->sh.i_num_ref_idx_l1_active - 1, i_ref );
+                }
+            }
+        }
+
+        /* motion vector differences: list 0 then list 1 */
+        for( i = 0; i < i_part; i++ )
+        {
+            if( b_list0[i] )
+            {
+                x264_mb_partition_get( mb, 0, i, 0, NULL, &mvx, &mvy );
+                x264_mb_predict_mv( mb, 0, i, 0, mvp );
+
+                bs_write_se( s, mvx - mvp[0] );
+                bs_write_se( s, mvy - mvp[1] );
+            }
+        }
+        for( i = 0; i < i_part; i++ )
+        {
+            if( b_list1[i] )
+            {
+                x264_mb_partition_get( mb, 1, i, 0, NULL, &mvx, &mvy );
+                x264_mb_predict_mv( mb, 1, i, 0, mvp );
+
+                bs_write_se( s, mvx - mvp[0] );
+                bs_write_se( s, mvy - mvp[1] );
+            }
+        }
+    }
+    else if( mb->i_type == B_DIRECT )
+    {
+        bs_write_ue( s, 0 );
+    }
+    else
+    {
+        /* never reached: the two previous tests cover every value of i_type */
+        fprintf( stderr, "invalid/unhandled mb_type\n" );
+        return;
+    }
+
+    /* Coded block patern */
+    if( mb->i_type == I_4x4 )
+    {
+        bs_write_ue( s, intra4x4_cbp_to_golomb[( mb->i_cbp_chroma << 4 )|mb->i_cbp_luma] );
+    }
+    else if( mb->i_type != I_16x16 )
+    {
+        bs_write_ue( s, inter_cbp_to_golomb[( mb->i_cbp_chroma << 4 )|mb->i_cbp_luma] );
+    }
+
+    /* the qp delta is relative to the qp of the previous macroblock, or to
+     * the slice qp for the first macroblock.
+     * NOTE(review): assumes the macroblocks are stored contiguously and that
+     * (mb-1)->i_qp is the qp a decoder derives for the previous macroblock */
+    if( mb->i_mb_x > 0 || mb->i_mb_y > 0 )
+    {
+        i_qp_pred = (mb-1)->i_qp;
+    }
+    else
+    {
+        i_qp_pred = h->pps->i_pic_init_qp + h->sh.i_qp_delta;
+    }
+
+    /* write residual */
+    if( mb->i_type == I_16x16 )
+    {
+        bs_write_se( s, mb->i_qp - i_qp_pred );
+
+        /* DC Luma */
+        block_residual_write_cavlc( h, s, mb, BLOCK_INDEX_LUMA_DC , mb->luma16x16_dc, 16 );
+
+        if( mb->i_cbp_luma != 0 )
+        {
+            /* AC Luma */
+            for( i = 0; i < 16; i++ )
+            {
+                block_residual_write_cavlc( h, s, mb, i, mb->block[i].residual_ac, 15 );
+            }
+        }
+    }
+    else if( mb->i_cbp_luma != 0 || mb->i_cbp_chroma != 0 )
+    {
+        bs_write_se( s, mb->i_qp - i_qp_pred );
+
+        /* only the 4x4 blocks of the 8x8 blocks flagged in the cbp are written */
+        for( i = 0; i < 16; i++ )
+        {
+            if( mb->i_cbp_luma & ( 1 << ( i / 4 ) ) )
+            {
+                block_residual_write_cavlc( h, s, mb, i, mb->block[i].luma4x4, 16 );
+            }
+        }
+    }
+    if( mb->i_cbp_chroma != 0 )
+    {
+        /* Chroma DC residual present */
+        block_residual_write_cavlc( h, s, mb, BLOCK_INDEX_CHROMA_DC, mb->chroma_dc[0], 4 );
+        block_residual_write_cavlc( h, s, mb, BLOCK_INDEX_CHROMA_DC, mb->chroma_dc[1], 4 );
+        if( mb->i_cbp_chroma&0x02 ) /* Chroma AC residual present */
+        {
+            for( i = 0; i < 8; i++ )
+            {
+                block_residual_write_cavlc( h, s, mb, 16 + i, mb->block[16+i].residual_ac, 15 );
+            }
+        }
+    }
+}
+
+/*****************************************************************************
+ *
+ * Cabac stuff
+ *
+ *****************************************************************************/
+
+/* Encode the type of mb with CABAC. Only I and P slices are handled: for any
+ * other slice type an error is printed on stderr and nothing is encoded */
+static void x264_cabac_mb_type( x264_t *h, x264_macroblock_t *mb )
+{
+    /* contexts of the bins following the terminal bin of an intra 16x16 type:
+     * luma cbp != 0, chroma cbp != 0, chroma cbp != 1, prediction mode (2 bins) */
+    static const int ctx_i16x16_islice[5] = { 3 + 3, 3 + 4, 3 + 5, 3 + 6, 3 + 7 };
+    static const int ctx_i16x16_pslice[5] = { 17+1,  17+2,  17+2,  17+3,  17+3  };
+
+    x264_macroblock_t *mba = mb->context->mba;
+    x264_macroblock_t *mbb = mb->context->mbb;
+    const int *p_ctx;
+
+    if( h->sh.i_type == SLICE_TYPE_I )
+    {
+        /* the context of the first bin depends on the type of the neighbours */
+        int i_ctx = 3;
+
+        if( mba != NULL && mba->i_type != I_4x4 )
+        {
+            i_ctx++;
+        }
+        if( mbb != NULL && mbb->i_type != I_4x4 )
+        {
+            i_ctx++;
+        }
+
+        if( mb->i_type == I_4x4 )
+        {
+            x264_cabac_encode_decision( &h->cabac, i_ctx, 0 );
+            return;
+        }
+        x264_cabac_encode_decision( &h->cabac, i_ctx, 1 );
+        p_ctx = ctx_i16x16_islice;
+    }
+    else if( h->sh.i_type == SLICE_TYPE_P )
+    {
+        /* prefix: 14, suffix: 17 */
+        if( mb->i_type == P_L0 )
+        {
+            switch( mb->i_partition )
+            {
+                case D_16x16:
+                    x264_cabac_encode_decision( &h->cabac, 14, 0 );
+                    x264_cabac_encode_decision( &h->cabac, 15, 0 );
+                    x264_cabac_encode_decision( &h->cabac, 16, 0 );
+                    break;
+                case D_16x8:
+                    x264_cabac_encode_decision( &h->cabac, 14, 0 );
+                    x264_cabac_encode_decision( &h->cabac, 15, 1 );
+                    x264_cabac_encode_decision( &h->cabac, 17, 1 );
+                    break;
+                case D_8x16:
+                    x264_cabac_encode_decision( &h->cabac, 14, 0 );
+                    x264_cabac_encode_decision( &h->cabac, 15, 1 );
+                    x264_cabac_encode_decision( &h->cabac, 17, 0 );
+                    break;
+                default:
+                    break;
+            }
+            return;
+        }
+        if( mb->i_type == P_8x8 )
+        {
+            x264_cabac_encode_decision( &h->cabac, 14, 0 );
+            x264_cabac_encode_decision( &h->cabac, 15, 0 );
+            x264_cabac_encode_decision( &h->cabac, 16, 1 );
+            return;
+        }
+
+        /* every other type is coded as an intra one: prefix */
+        x264_cabac_encode_decision( &h->cabac, 14, 1 );
+
+        /* suffix */
+        if( mb->i_type == I_4x4 )
+        {
+            x264_cabac_encode_decision( &h->cabac, 17, 0 );
+            return;
+        }
+        x264_cabac_encode_decision( &h->cabac, 17, 1 );
+        p_ctx = ctx_i16x16_pslice;
+    }
+    else
+    {
+        fprintf( stderr, "SLICE_TYPE_B unsupported in x264_macroblock_write_cabac\n" );
+        return;
+    }
+
+    /* I_PCM and intra 16x16 are told apart by a terminal bin */
+    if( mb->i_type == I_PCM )
+    {
+        x264_cabac_encode_terminal( &h->cabac, 1 ); /*ctxIdx == 276 */
+        return;
+    }
+    x264_cabac_encode_terminal( &h->cabac, 0 ); /*ctxIdx == 276 */
+
+    /* intra 16x16: coded block pattern then prediction mode */
+    x264_cabac_encode_decision( &h->cabac, p_ctx[0], ( mb->i_cbp_luma == 0 ? 0 : 1 ));
+    if( mb->i_cbp_chroma != 0 )
+    {
+        x264_cabac_encode_decision( &h->cabac, p_ctx[1], 1 );
+        x264_cabac_encode_decision( &h->cabac, p_ctx[2], ( mb->i_cbp_chroma == 1 ? 0 : 1 ) );
+    }
+    else
+    {
+        x264_cabac_encode_decision( &h->cabac, p_ctx[1], 0 );
+    }
+    x264_cabac_encode_decision( &h->cabac, p_ctx[3], ( (mb->i_intra16x16_pred_mode / 2) ? 1 : 0 ));
+    x264_cabac_encode_decision( &h->cabac, p_ctx[4], ( (mb->i_intra16x16_pred_mode % 2) ? 1 : 0 ));
+}
+
+/* Encode the intra 4x4 prediction mode i_mode, i_pred being its predicted value */
+static void x264_cabac_mb_intra4x4_pred_mode( x264_t *h, x264_macroblock_t *mb, int i_pred, int i_mode )
+{
+    int i_bit;
+
+    /* b_prev_intra4x4_pred_mode */
+    x264_cabac_encode_decision( &h->cabac, 68, i_pred == i_mode ? 1 : 0 );
+    if( i_pred == i_mode )
+    {
+        return;
+    }
+
+    /* the predicted mode is skipped in the numbering of the remaining modes */
+    if( i_mode > i_pred  )
+    {
+        i_mode--;
+    }
+    /* 3 bits, least significant first */
+    for( i_bit = 0; i_bit < 3; i_bit++ )
+    {
+        x264_cabac_encode_decision( &h->cabac, 69, ( i_mode >> i_bit )&0x01 );
+    }
+}
+/* Encode the chroma prediction mode of mb (0 to 3) as a truncated unary code */
+static void x264_cabac_mb_intra8x8_pred_mode( x264_t *h, x264_macroblock_t *mb )
+{
+    x264_macroblock_t *mba = mb->context->mba;
+    x264_macroblock_t *mbb = mb->context->mbb;
+    const int i_mode = mb->i_chroma_pred_mode;
+
+    /* context of the first bin: one more for each I_4x4/I_16x16 neighbour
+     * with a chroma prediction mode different from 0 */
+    int i_ctx = 64;
+
+    if( mba != NULL && ( mba->i_type == I_4x4 || mba->i_type == I_16x16 ) && mba->i_chroma_pred_mode != 0 )
+    {
+        i_ctx++;
+    }
+    if( mbb != NULL && ( mbb->i_type == I_4x4 || mbb->i_type == I_16x16 ) && mbb->i_chroma_pred_mode != 0 )
+    {
+        i_ctx++;
+    }
+
+    x264_cabac_encode_decision( &h->cabac, i_ctx, i_mode == 0 ? 0 : 1 );
+    if( i_mode == 0 )
+    {
+        return;
+    }
+
+    x264_cabac_encode_decision( &h->cabac, 64 + 3, ( i_mode == 1 ? 0 : 1 ) );
+    if( i_mode > 1 )
+    {
+        x264_cabac_encode_decision( &h->cabac, 64 + 3, ( i_mode == 2 ? 0 : 1 ) );
+    }
+}
+
+/* Encode the 4 bits of the luma coded block pattern of mb, one per 8x8 block */
+static void x264_cabac_mb_cbp_luma( x264_t *h, x264_macroblock_t *mb )
+{
+    int i8x8;
+
+    for( i8x8 = 0; i8x8 < 4; i8x8++ )
+    {
+        /* first 4x4 block of the 8x8 block and its position */
+        const int idx = 4 * i8x8;
+        const int x = block_idx_x[idx];
+        const int y = block_idx_y[idx];
+
+        /* macroblocks holding the left and top neighbours, and index of the
+         * 8x8 block of these neighbours inside their macroblock */
+        x264_macroblock_t *mba = mb->context->block[idx].mba;
+        x264_macroblock_t *mbb = mb->context->block[idx].mbb;
+        const int i8x8a = block_idx_xy[(x-1)&0x03][y]/4;
+        const int i8x8b = block_idx_xy[x][(y-1)&0x03]/4;
+
+        int i_ctx = 73;
+
+        /* a neighbour counts when it is skipped or when its 8x8 block is not coded */
+        if( mba != NULL && mba->i_type != I_PCM &&
+           ( IS_SKIP( mba->i_type ) || ((mba->i_cbp_luma >> i8x8a)&0x01) == 0 ) )
+        {
+            i_ctx += 1;
+        }
+        if( mbb != NULL && mbb->i_type != I_PCM &&
+           ( IS_SKIP( mbb->i_type ) || ((mbb->i_cbp_luma >> i8x8b)&0x01) == 0 ) )
+        {
+            i_ctx += 2;
+        }
+        x264_cabac_encode_decision( &h->cabac, i_ctx, (mb->i_cbp_luma >> i8x8)&0x01 );
+    }
+}
+
+/* Encode the chroma coded block pattern of mb (0: nothing, 1: dc, 2: dc+ac)
+ * on one or two bins */
+static void x264_cabac_mb_cbp_chroma( x264_t *h, x264_macroblock_t *mb )
+{
+    x264_macroblock_t *mba = mb->context->mba;
+    x264_macroblock_t *mbb = mb->context->mbb;
+
+    /* only the neighbours that exist and are not skipped are used */
+    const int b_use_a = mba != NULL && !IS_SKIP( mba->i_type );
+    const int b_use_b = mbb != NULL && !IS_SKIP( mbb->i_type );
+    int i_ctx;
+
+    /* first bin: chroma coded or not */
+    i_ctx = 77;
+    if( b_use_a && ( mba->i_type == I_PCM || mba->i_cbp_chroma != 0 ) )
+    {
+        i_ctx += 1;
+    }
+    if( b_use_b && ( mbb->i_type == I_PCM || mbb->i_cbp_chroma != 0 ) )
+    {
+        i_ctx += 2;
+    }
+    x264_cabac_encode_decision( &h->cabac, i_ctx, (mb->i_cbp_chroma > 0 ? 1 : 0) );
+    if( mb->i_cbp_chroma <= 0 )
+    {
+        return;
+    }
+
+    /* second bin: dc only or dc+ac */
+    i_ctx = 77 + 4;
+    if( b_use_a && ( mba->i_type == I_PCM || mba->i_cbp_chroma == 2 ) )
+    {
+        i_ctx += 1;
+    }
+    if( b_use_b && ( mbb->i_type == I_PCM || mbb->i_cbp_chroma == 2 ) )
+    {
+        i_ctx += 2;
+    }
+    x264_cabac_encode_decision( &h->cabac, i_ctx, (mb->i_cbp_chroma > 1 ? 1 : 0) );
+}
+
+/* Encode the difference between the qp of mb and the qp of the previous
+ * macroblock (mb - 1), or the slice qp for the first macroblock.
+ * TODO check it with != qp per mb */
+static void x264_cabac_mb_qp_delta( x264_t *h, x264_macroblock_t *mb )
+{
+    const int i_slice_qp = h->pps->i_pic_init_qp + h->sh.i_qp_delta;
+    const int b_first = !( mb->i_mb_x > 0 || mb->i_mb_y > 0 );
+    x264_macroblock_t *mbp = b_first ? NULL : mb - 1;
+    int i_ctxIdxInc = 0;
+    int i_dqp;
+    int val;
+
+    if( mbp != NULL )
+    {
+        /* qp delta that was used by the previous macroblock */
+        int i_last_dqp;
+
+        if( mbp->i_mb_x > 0 || mbp->i_mb_y > 0 )
+        {
+            i_last_dqp = mbp->i_qp - (mbp-1)->i_qp;
+        }
+        else
+        {
+            i_last_dqp = mbp->i_qp - i_slice_qp;
+        }
+
+        if( !IS_SKIP( mbp->i_type ) && mbp->i_type != I_PCM &&
+            i_last_dqp != 0 &&
+            ( mbp->i_type == I_16x16 || mbp->i_cbp_luma != 0 || mbp->i_cbp_chroma != 0 ) )
+        {
+            i_ctxIdxInc = 1;
+        }
+        i_dqp = mb->i_qp - mbp->i_qp;
+    }
+    else
+    {
+        i_dqp = mb->i_qp - i_slice_qp;
+    }
+
+    /* map the signed delta: 0, 1, -1, 2, -2, ... -> 0, 1, 2, 3, 4, ... */
+    val = i_dqp <= 0 ? -2 * i_dqp : 2 * i_dqp - 1;
+
+    /* unary code, the context being 2 for the second bin and 3 afterwards */
+    for( ; val > 0; val-- )
+    {
+        x264_cabac_encode_decision( &h->cabac,  60 + i_ctxIdxInc, 1 );
+        i_ctxIdxInc = i_ctxIdxInc < 2 ? 2 : 3;
+    }
+    x264_cabac_encode_decision( &h->cabac,  60 + i_ctxIdxInc, 0 );
+}
+
+/* Compute the context index offset of the coded block flag of a block of mb.
+ *  i_ctxBlockCat: category of the block, as used below:
+ *      0: luma DC of an I_16x16, 1 and 2: luma 4x4 block (i_idx in mb->block[]),
+ *      3: chroma DC (i_idx is the chroma plane), 4: chroma AC (block 16+i_idx).
+ *  The left (a) and top (b) neighbours add respectively 1 and 2 when they
+ *  hold non zero coefficients, when they are I_PCM, or when they do not exist
+ *  while mb is intra.
+ *  Return this increment plus 4 * i_ctxBlockCat. */
+static int x264_cabac_mb_cbf_ctxidxinc( x264_macroblock_t *mb, int i_ctxBlockCat, int i_idx )
+{
+    x264_mb_context_t *ctx = mb->context;
+    x264_macroblock_t *a = NULL;
+    x264_macroblock_t *b = NULL;
+    /* non zero count of the neighbouring blocks, -1 when it must not be used */
+    int i_nza = -1;
+    int i_nzb = -1;
+
+    int i_ctxIdxInc = 0;
+
+    if( i_ctxBlockCat == 0 )
+    {
+        a = ctx->mba;
+        b = ctx->mbb;
+
+        /* only an I_16x16 neighbour has a luma DC block */
+        if( a !=NULL && a->i_type == I_16x16 )
+        {
+            i_nza = array_non_zero_count( a->luma16x16_dc, 16 );
+        }
+        if( b !=NULL && b->i_type == I_16x16 )
+        {
+            i_nzb = array_non_zero_count( b->luma16x16_dc, 16 );
+        }
+    }
+    else if( i_ctxBlockCat == 1 || i_ctxBlockCat == 2 )
+    {
+        int i8x8a, i8x8b;
+        int x, y;
+
+        a = ctx->block[i_idx].mba;
+        b = ctx->block[i_idx].mbb;
+
+        x = block_idx_x[i_idx];
+        y = block_idx_y[i_idx];
+
+        /* index of the 8x8 block holding the neighbour, in its macroblock */
+        i8x8a = block_idx_xy[(x-1)&0x03][y]/4;
+        i8x8b = block_idx_xy[x][(y-1)&0x03]/4;
+
+        /* the neighbour is used only if its own 8x8 block is flagged in the
+         * cbp: the bit has to be isolated, otherwise the bits of the
+         * following 8x8 blocks would be tested too */
+        if( a != NULL && !IS_SKIP( a->i_type ) && a->i_type != I_PCM &&
+            ((a->i_cbp_luma >> i8x8a)&0x01) != 0 )
+        {
+            i_nza = ctx->block[i_idx].bka->i_non_zero_count;
+        }
+        if( b != NULL && !IS_SKIP( b->i_type ) && b->i_type != I_PCM &&
+            ((b->i_cbp_luma >> i8x8b)&0x01) != 0 )
+        {
+            i_nzb = ctx->block[i_idx].bkb->i_non_zero_count;
+        }
+    }
+    else if( i_ctxBlockCat == 3 )
+    {
+        a = ctx->mba;
+        b = ctx->mbb;
+
+        if( a != NULL && !IS_SKIP( a->i_type ) && a->i_type != I_PCM &&
+            a->i_cbp_chroma != 0 )
+        {
+            i_nza = array_non_zero_count( a->chroma_dc[i_idx], 4 );
+        }
+        if( b != NULL && !IS_SKIP( b->i_type ) && b->i_type != I_PCM &&
+            b->i_cbp_chroma != 0 )
+        {
+            i_nzb = array_non_zero_count( b->chroma_dc[i_idx], 4 );
+        }
+    }
+    else if( i_ctxBlockCat == 4 )
+    {
+        a = ctx->block[16+i_idx].mba;
+        b = ctx->block[16+i_idx].mbb;
+
+        /* chroma AC is present only when the chroma cbp is 2 */
+        if( a != NULL && !IS_SKIP( a->i_type ) && a->i_type != I_PCM &&
+            a->i_cbp_chroma == 2 )
+        {
+            i_nza = ctx->block[16+i_idx].bka->i_non_zero_count;
+        }
+        if( b != NULL && !IS_SKIP( b->i_type ) && b->i_type != I_PCM &&
+            b->i_cbp_chroma == 2 )
+        {
+            i_nzb = ctx->block[16+i_idx].bkb->i_non_zero_count;
+        }
+    }
+
+    if( ( a == NULL && IS_INTRA( mb->i_type ) ) || ( a != NULL && a->i_type == I_PCM ) || i_nza > 0 )
+    {
+        i_ctxIdxInc++;
+    }
+    if( ( b == NULL && IS_INTRA( mb->i_type ) ) || ( b != NULL && b->i_type == I_PCM ) || i_nzb > 0 )
+    {
+        i_ctxIdxInc += 2;
+    }
+
+    return i_ctxIdxInc + 4 * i_ctxBlockCat;
+}
+
+/* Encode the skip flag b_skip of mb. The context starts at 11 for a P slice
+ * and at 24 for any other slice type (B), plus one for each existing and non
+ * skipped neighbour (left, top) */
+void x264_cabac_mb_skip( x264_t *h, x264_macroblock_t *mb, int b_skip )
+{
+    x264_macroblock_t *mba = mb->context->mba;
+    x264_macroblock_t *mbb = mb->context->mbb;
+    int i_ctx = h->sh.i_type == SLICE_TYPE_P ? 11 : 24;
+
+    if( mba != NULL && !IS_SKIP( mba->i_type ) )
+    {
+        i_ctx++;
+    }
+    if( mbb != NULL && !IS_SKIP( mbb->i_type ) )
+    {
+        i_ctx++;
+    }
+
+    x264_cabac_encode_decision( &h->cabac, i_ctx, b_skip ? 1 : 0 );
+}
+
+/* Encode, as a unary code, the reference index used for the list i_list by
+ * the partition i_part of mb */
+static void x264_cabac_mb_ref( x264_t *h, x264_macroblock_t *mb, int i_list, int i_part )
+{
+    x264_macroblock_t *a;
+    x264_macroblock_t *b;
+
+    int i_ctxIdxInc = 0;
+    int i_ref;
+
+    int x, y, xn, yn;
+
+    x264_mb_partition_getxy( mb, i_part, 0, &x, &y );
+
+    /* Left  pixel (-1,0): in mb itself, or in the last column of the left macroblock */
+    if( x > 0 )
+    {
+        xn = x - 1;
+        a = mb;
+    }
+    else
+    {
+        xn = x + 3;
+        a = mb->context->mba;
+    }
+
+    /* Up ( pixel(0,-1): in mb itself, or in the last row of the top macroblock */
+    if( y > 0 )
+    {
+        yn = y - 1;
+        b = mb;
+    }
+    else
+    {
+        yn = y + 3;
+        b = mb->context->mbb;
+    }
+
+    /* a neighbour counts when it exists, is neither intra nor skipped and
+     * uses a reference index greater than 0 */
+    /* FIXME not complete for B frame (B_DIRECT and B_DIRECT 8x8 sub */
+    if( a && !IS_INTRA( a->i_type ) && !IS_SKIP( a->i_type ) &&
+        a->partition[xn][y].i_ref[i_list] > 0 )
+    {
+        i_ctxIdxInc += 1;
+    }
+    if( b && !IS_INTRA( b->i_type ) && !IS_SKIP( b->i_type ) &&
+        b->partition[x][yn].i_ref[i_list] > 0 )
+    {
+        i_ctxIdxInc += 2;
+    }
+
+    /* unary code, the context being 4 for the second bin and 5 afterwards */
+    for( i_ref = mb->partition[x][y].i_ref[i_list]; i_ref > 0; i_ref-- )
+    {
+        x264_cabac_encode_decision( &h->cabac, 54 + i_ctxIdxInc, 1 );
+        i_ctxIdxInc = i_ctxIdxInc < 4 ? 4 : 5;
+    }
+    x264_cabac_encode_decision( &h->cabac, 54 + i_ctxIdxInc, 0 );
+}
+
+/* Encode one motion vector difference component.
+ *
+ * A unary prefix of min(|mvd|, 9) ones is written with context decisions,
+ * closed by a zero only when |mvd| < 9; otherwise |mvd| - 9 follows as an
+ * Exp-Golomb style suffix (initial order 3) in bypass mode.  Non-zero values
+ * end with a bypass sign bit (1 = negative).  i_ctx is the base context,
+ * i_ctx_inc the increment for the first prefix bin. */
+static void  x264_cabac_mb_mvd( x264_t *h, int i_ctx, int i_ctx_inc, int mvd )
+{
+    const int i_magnitude = abs( mvd );
+    const int b_escape = i_magnitude >= 9;
+    int i_ones = b_escape ? 9 : i_magnitude;
+
+    /* prefix: after the first bin the increment moves to 3 and then up to 6 */
+    while( i_ones-- > 0 )
+    {
+        x264_cabac_encode_decision( &h->cabac, i_ctx + i_ctx_inc, 1 );
+        if( i_ctx_inc < 3 )
+            i_ctx_inc = 3;
+        else if( i_ctx_inc < 6 )
+            i_ctx_inc++;
+    }
+
+    if( !b_escape )
+    {
+        x264_cabac_encode_decision( &h->cabac, i_ctx + i_ctx_inc, 0 );
+    }
+    else
+    {
+        /* suffix */
+        int i_rest = i_magnitude - 9;
+        int i_bits = 3;
+
+        for( ; i_rest >= ( 1 << i_bits ); i_bits++ )
+        {
+            x264_cabac_encode_bypass( &h->cabac, 1 );
+            i_rest -= 1 << i_bits;
+        }
+        x264_cabac_encode_bypass( &h->cabac, 0 );
+        while( i_bits-- )
+        {
+            x264_cabac_encode_bypass( &h->cabac, ( i_rest >> i_bits ) & 0x01 );
+        }
+    }
+
+    /* sign */
+    if( mvd != 0 )
+    {
+        x264_cabac_encode_bypass( &h->cabac, mvd < 0 ? 1 : 0 );
+    }
+}
+
+/* Encode the motion vector difference of sub-partition i_sub of partition
+ * i_part for list i_list.
+ *
+ * The difference is obtained from x264_mb_partition_mvd().  For each
+ * component the context increment of the first bin is chosen from the sum
+ * of the absolute mvd of the left and top neighbouring blocks; neighbours
+ * that are missing, intra or skipped contribute 0:
+ *      sum < 3 -> 0,   sum > 32 -> 2,   otherwise -> 1
+ * The x component is coded with base context 40, the y component with 47. */
+static void  x264_cabac_mb_mv( x264_t *h, x264_macroblock_t *mb, int i_list, int i_part, int i_sub )
+{
+    x264_macroblock_t *nb;
+
+    int mvd[2];
+    int i_sum[2] = { 0, 0 };    /* |mvd| of the neighbours, per component */
+    int x, y, x_nb, y_nb;
+    int i_comp;
+
+    /* get and update mvd */
+    x264_mb_partition_mvd( mb, i_list, i_part, i_sub, mvd );
+
+    /* get context */
+    x264_mb_partition_getxy( mb, i_part, i_sub, &x, &y );
+
+    /* FIXME not complete for B frame (B_DIRECT and B_DIRECT 8x8 sub) */
+    /* Left block (-1,0) */
+    x_nb = x - 1;
+    nb = mb;
+    if( x_nb < 0 )
+    {
+        x_nb += 4;
+        nb = mb->context->mba;
+    }
+    if( nb && !IS_INTRA( nb->i_type ) && !IS_SKIP( nb->i_type) )
+    {
+        i_sum[0] += abs( nb->partition[x_nb][y].mvd[i_list][0] );
+        i_sum[1] += abs( nb->partition[x_nb][y].mvd[i_list][1] );
+    }
+
+    /* Top block (0,-1) */
+    y_nb = y - 1;
+    nb = mb;
+    if( y_nb < 0 )
+    {
+        y_nb += 4;
+        nb = mb->context->mbb;
+    }
+    if( nb && !IS_INTRA( nb->i_type ) && !IS_SKIP( nb->i_type) )
+    {
+        i_sum[0] += abs( nb->partition[x][y_nb].mvd[i_list][0] );
+        i_sum[1] += abs( nb->partition[x][y_nb].mvd[i_list][1] );
+    }
+
+    /* Code both components with the same rule: index 0 is the x component
+     * (base context 40), index 1 the y component (base context 47); the x
+     * component is written first */
+    for( i_comp = 0; i_comp < 2; i_comp++ )
+    {
+        const int i_ctx = ( i_comp == 0 ) ? 40 : 47;
+        int i_inc;
+
+        if( i_sum[i_comp] < 3 )
+        {
+            i_inc = 0;
+        }
+        else if( i_sum[i_comp] > 32 )
+        {
+            i_inc = 2;
+        }
+        else
+        {
+            i_inc = 1;
+        }
+
+        x264_cabac_mb_mvd( h, i_ctx, i_inc, mvd[i_comp] );
+    }
+}
+/* Encode the sub-partition type of one 8x8 block of a P_8x8 macroblock.
+ *
+ * Bins are written with contexts 21, 22 and 23 in that order:
+ *      D_L0_8x8 -> 1      D_L0_8x4 -> 0 0
+ *      D_L0_4x8 -> 0 1 1  D_L0_4x4 -> 0 1 0
+ * Any other value of i_sub writes nothing. */
+static void x264_cabac_mb_sub_partition( x264_t *h, int i_sub )
+{
+    if( i_sub == D_L0_8x8 )
+    {
+        x264_cabac_encode_decision( &h->cabac, 21, 1 );
+    }
+    else if( i_sub == D_L0_8x4 || i_sub == D_L0_4x8 || i_sub == D_L0_4x4 )
+    {
+        x264_cabac_encode_decision( &h->cabac, 21, 0 );
+        x264_cabac_encode_decision( &h->cabac, 22, i_sub == D_L0_8x4 ? 0 : 1 );
+        if( i_sub != D_L0_8x4 )
+        {
+            /* third bin tells 4x8 (1) from 4x4 (0) */
+            x264_cabac_encode_decision( &h->cabac, 23, i_sub == D_L0_4x8 ? 1 : 0 );
+        }
+    }
+}
+
+/* Write one residual block with CABAC: the coded block flag, then the
+ * significant / last-significant flags of the coefficients and finally, from
+ * the last non-zero coefficient back to the first, each |level| - 1 and sign.
+ * l[] holds i_count coefficients; i_ctxBlockCat and i_idx are described below. */
+static void block_residual_write_cabac( x264_t *h, x264_macroblock_t *mb, int i_ctxBlockCat, int i_idx, int *l, int i_count )
+{
+    static const int significant_coeff_flag_offset[5] = { 0, 15, 29, 44, 47 };
+    static const int last_significant_coeff_flag_offset[5] = { 0, 15, 29, 44, 47 };
+    static const int coeff_abs_level_m1_offset[5] = { 0, 10, 20, 30, 39 };
+
+    int i_coeff_abs_m1[16];     /* |level| - 1 of each non-zero coefficient */
+    int i_coeff_sign[16];       /* 1 when the matching coefficient is negative */
+    int i_coeff = 0;            /* number of non-zero coefficients */
+    int i_last  = 0;            /* index in l[] of the last non-zero coefficient */
+
+    int i_abslevel1 = 0;        /* levels equal to +/-1 written so far */
+    int i_abslevelgt1 = 0;      /* levels larger than 1 in magnitude written so far */
+
+    int i;
+
+    /* i_ctxBlockCat: 0-> DC 16x16  i_idx = 0
+     *                1-> AC 16x16  i_idx = luma4x4idx
+     *                2-> Luma4x4   i_idx = luma4x4idx
+     *                3-> DC Chroma i_idx = iCbCr
+     *                4-> AC Chroma i_idx = 4 * iCbCr + chroma4x4idx
+     */
+
+    /* Collect magnitude and sign of every non-zero coefficient */
+    for( i = 0; i < i_count; i++ )
+    {
+        if( l[i] != 0 )
+        {
+            i_coeff_abs_m1[i_coeff] = abs( l[i] ) - 1;
+            i_coeff_sign[i_coeff]   = ( l[i] < 0 ? 1 : 0);
+            i_coeff++;
+            i_last = i;
+        }
+    }
+
+    if( i_coeff == 0 )
+    {
+        /* coded block flag: the block has no coefficient, nothing else follows */
+        x264_cabac_encode_decision( &h->cabac,  85 + x264_cabac_mb_cbf_ctxidxinc( mb, i_ctxBlockCat, i_idx ), 0 );
+        return;
+    }
+
+    /* coded block flag: at least one coefficient is present */
+    x264_cabac_encode_decision( &h->cabac,  85 + x264_cabac_mb_cbf_ctxidxinc( mb, i_ctxBlockCat, i_idx ), 1 );
+    for( i = 0; i < i_count - 1; i++ )  /* position i_count - 1 never gets a flag */
+    {
+        int i_ctxIdxInc;
+
+        i_ctxIdxInc = X264_MIN( i, i_count - 2 );
+
+        if( l[i] != 0 )
+        {
+            /* significant flag, then whether this is the last non-zero one */
+            x264_cabac_encode_decision( &h->cabac, 105 + significant_coeff_flag_offset[i_ctxBlockCat] + i_ctxIdxInc, 1 );
+            x264_cabac_encode_decision( &h->cabac, 166 + last_significant_coeff_flag_offset[i_ctxBlockCat] + i_ctxIdxInc, i == i_last ? 1 : 0 );
+        }
+        else
+        {
+            x264_cabac_encode_decision( &h->cabac, 105 + significant_coeff_flag_offset[i_ctxBlockCat] + i_ctxIdxInc, 0 );
+        }
+        if( i == i_last )
+        {
+            break;
+        }
+    }
+
+    for( i = i_coeff - 1; i >= 0; i-- )     /* reverse order: last coefficient first */
+    {
+        int i_prefix;
+        int i_ctxIdxInc;
+
+        /* write coeff_abs - 1 */
+
+        /* prefix: unary code capped at 14 */
+        i_prefix = X264_MIN( i_coeff_abs_m1[i], 14 );
+
+        i_ctxIdxInc = (i_abslevelgt1 != 0 ? 0 : X264_MIN( 4, i_abslevel1 + 1 )) + coeff_abs_level_m1_offset[i_ctxBlockCat];
+        if( i_prefix == 0 )
+        {
+            x264_cabac_encode_decision( &h->cabac,  227 + i_ctxIdxInc, 0 );
+        }
+        else
+        {
+            int j;
+            x264_cabac_encode_decision( &h->cabac,  227 + i_ctxIdxInc, 1 );
+            /* every bin after the first shares one context */
+            i_ctxIdxInc = 5 + X264_MIN( 4, i_abslevelgt1 ) + coeff_abs_level_m1_offset[i_ctxBlockCat];
+            for( j = 0; j < i_prefix - 1; j++ )
+            {
+                x264_cabac_encode_decision( &h->cabac,  227 + i_ctxIdxInc, 1 );
+            }
+            if( i_prefix < 14 )
+            {
+                x264_cabac_encode_decision( &h->cabac,  227 + i_ctxIdxInc, 0 );
+            }
+        }
+        /* suffix: Exp-Golomb style code (initial order 0) of the part above 14, in bypass mode */
+        if( i_coeff_abs_m1[i] >= 14 )
+        {
+            int k = 0;
+            int i_suffix = i_coeff_abs_m1[i] - 14;
+
+            while( i_suffix >= (1<<k) )
+            {
+                x264_cabac_encode_bypass( &h->cabac, 1 );
+                i_suffix -= 1 << k;
+                k++;
+            }
+            x264_cabac_encode_bypass( &h->cabac, 0 );
+            while( k-- )
+            {
+                x264_cabac_encode_bypass( &h->cabac, (i_suffix >> k)&0x01 );
+            }
+        }
+
+        /* write sign */
+        x264_cabac_encode_bypass( &h->cabac, i_coeff_sign[i] );
+
+        /* update the counters that select the contexts of the next level */
+        if( i_coeff_abs_m1[i] == 0 )
+        {
+            i_abslevel1++;
+        }
+        else
+        {
+            i_abslevelgt1++;
+        }
+    }
+}
+
+
+
+/* Write one macroblock with CABAC: macroblock type, prediction data (intra
+ * prediction modes, or sub-partitions / reference indexes / motion vector
+ * differences for P macroblocks), coded block patterns, QP delta and the
+ * residual blocks.
+ *
+ * I_PCM macroblocks (UNTESTED) write their raw 16x16 luma and 8x8 chroma
+ * samples directly to s, after which the CABAC encoder is re-initialised. */
+void x264_macroblock_write_cabac( x264_t *h, bs_t *s, x264_macroblock_t *mb )
+{
+    int i;
+
+    /* Write the MB type */
+    x264_cabac_mb_type( h, mb );
+
+    /* PCM special block type UNTESTED */
+    if( mb->i_type == I_PCM )
+    {
+        bs_align_0( s );    /* not sure */
+        /* Luma: 16 rows of 16 samples; i / 16 selects the row, i % 16 the column */
+        for( i = 0; i < 16*16; i++ )
+        {
+            bs_write( s, 8, h->picture->plane[0][( mb->i_mb_y * 16 + i / 16 ) * h->picture->i_stride[0] + mb->i_mb_x * 16 + i % 16] );
+        }
+        /* Cb: 8 rows of 8 samples */
+        for( i = 0; i < 8*8; i++ )
+        {
+            bs_write( s, 8, h->picture->plane[1][( mb->i_mb_y * 8 + i / 8 ) * h->picture->i_stride[1] + mb->i_mb_x * 8 + i % 8] );
+        }
+        /* Cr: 8 rows of 8 samples */
+        for( i = 0; i < 8*8; i++ )
+        {
+            bs_write( s, 8, h->picture->plane[2][( mb->i_mb_y * 8 + i / 8 ) * h->picture->i_stride[2] + mb->i_mb_x * 8 + i % 8] );
+        }
+
+        for( i = 0; i < 16 + 8; i++ )
+        {
+            /* special case: every luma and chroma block counts as fully coded */
+            mb->block[i].i_non_zero_count = 16;
+        }
+
+        x264_cabac_encode_init( &h->cabac, s );
+        return;
+    }
+
+    if( IS_INTRA( mb->i_type ) )
+    {
+        /* Prediction */
+        if( mb->i_type == I_4x4 )
+        {
+            for( i = 0; i < 16; i++ )
+            {
+                x264_cabac_mb_intra4x4_pred_mode( h, mb,
+                                                  x264_mb_predict_intra4x4_mode( h, mb, i ),
+                                                  mb->block[i].i_intra4x4_pred_mode );
+            }
+        }
+        x264_cabac_mb_intra8x8_pred_mode( h, mb );
+    }
+    else if( mb->i_type == P_8x8 )
+    {
+        /* sub mb type */
+        for( i = 0; i < 4; i++ )
+        {
+            x264_cabac_mb_sub_partition( h, mb->i_sub_partition[i] );
+        }
+        /* ref 0: only coded when more than one reference is active */
+        if( h->sh.i_num_ref_idx_l0_active > 1 )
+        {
+            for( i = 0; i < 4; i++ )
+            {
+                x264_cabac_mb_ref( h, mb, 0, i );
+            }
+        }
+
+        for( i = 0; i < 4; i++ )
+        {
+            int i_sub;
+            for( i_sub = 0; i_sub < x264_mb_partition_count_table[mb->i_sub_partition[i]]; i_sub++ )
+            {
+                x264_cabac_mb_mv( h, mb, 0, i, i_sub );
+            }
+        }
+    }
+    else if( mb->i_type == B_8x8 )
+    {
+        /* TODO */
+        fprintf( stderr, "Arggg B_8x8\n" );
+    }
+    else if( mb->i_type != B_DIRECT )
+    {
+        /* FIXME -> invalid for B frame */
+
+        /* Motion Vector */
+        int i_part = x264_mb_partition_count_table[mb->i_partition];
+
+        if( h->sh.i_num_ref_idx_l0_active > 1 )
+        {
+            for( i = 0; i < i_part; i++ )
+            {
+                if( mb->i_type == P_L0 )
+                {
+                    x264_cabac_mb_ref( h, mb, 0, i );
+                }
+            }
+        }
+
+        for( i = 0; i < i_part; i++ )
+        {
+            if( mb->i_type == P_L0 )
+            {
+                x264_cabac_mb_mv( h, mb, 0, i, 0 );
+            }
+        }
+    }
+
+    if( mb->i_type != I_16x16 )
+    {
+        x264_cabac_mb_cbp_luma( h, mb );
+        x264_cabac_mb_cbp_chroma( h, mb );
+    }
+
+    if( mb->i_cbp_luma > 0 || mb->i_cbp_chroma > 0 || mb->i_type == I_16x16 )
+    {
+        x264_cabac_mb_qp_delta( h, mb );
+
+        /* write residual */
+        if( mb->i_type == I_16x16 )
+        {
+            /* DC Luma */
+            block_residual_write_cabac( h, mb, 0, 0, mb->luma16x16_dc, 16 );
+
+            if( mb->i_cbp_luma != 0 )
+            {
+                /* AC Luma */
+                for( i = 0; i < 16; i++ )
+                {
+                    block_residual_write_cabac( h, mb, 1, i, mb->block[i].residual_ac, 15 );
+                }
+            }
+        }
+        else
+        {
+            for( i = 0; i < 16; i++ )
+            {
+                /* one cbp bit covers four consecutive 4x4 luma blocks */
+                if( mb->i_cbp_luma & ( 1 << ( i / 4 ) ) )
+                {
+                    block_residual_write_cabac( h, mb, 2, i, mb->block[i].luma4x4, 16 );
+                }
+            }
+        }
+
+        if( mb->i_cbp_chroma &0x03 )    /* Chroma DC residual present */
+        {
+            block_residual_write_cabac( h, mb, 3, 0, mb->chroma_dc[0], 4 );
+            block_residual_write_cabac( h, mb, 3, 1, mb->chroma_dc[1], 4 );
+        }
+        if( mb->i_cbp_chroma&0x02 ) /* Chroma AC residual present */
+        {
+            for( i = 0; i < 8; i++ )
+            {
+                block_residual_write_cabac( h, mb, 4, i, mb->block[16+i].residual_ac, 15 );
+            }
+        }
+    }
+}
+
diff --git a/tools/.cvsignore b/tools/.cvsignore
new file mode 100644
index 00000000..bafdf1a1
--- /dev/null
+++ b/tools/.cvsignore
@@ -0,0 +1,2 @@
+xyuv
+avc2avi
diff --git a/tools/Jamfile b/tools/Jamfile
new file mode 100644
index 00000000..8507f195
--- /dev/null
+++ b/tools/Jamfile
@@ -0,0 +1,9 @@
+# Jamfile
+
+# Little tool to embed h264 into avi
+Main avc2avi : avc2avi.c ;
+
+# Little YUV I420 player
+LINKLIBS = `sdl-config --libs` ;
+Main xyuv : xyuv.c ;
+
diff --git a/tools/avc2avi.c b/tools/avc2avi.c
new file mode 100644
index 00000000..07f142bb
--- /dev/null
+++ b/tools/avc2avi.c
@@ -0,0 +1,820 @@
+/*****************************************************************************
+ * avc2avi.c: raw h264 -> AVI
+ *****************************************************************************
+ * Copyright (C) 2004 Laurent Aimar
+ * $Id: avc2avi.c,v 1.1 2004/06/03 19:27:08 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdint.h>
+
+#include <signal.h>
+#define _GNU_SOURCE
+#include <getopt.h>
+
+#ifdef _MSC_VER
+#include <io.h>     /* _setmode() */
+#include <fcntl.h>  /* _O_BINARY */
+#endif
+
+#include "../core/bs.h"
+
+#define DATA_MAX 3000000    /* size in bytes of the input buffer below */
+uint8_t data[DATA_MAX];     /* input bytes read by main() and not yet consumed as NALs */
+
+/* Ctrl-C handler: only raises a flag that the main loop polls */
+static volatile sig_atomic_t i_ctrl_c = 0;  /* the type a handler may safely write */
+static void    SigIntHandler( int a )
+{
+    i_ctrl_c = 1;   /* a (the signal number) is not needed */
+}
+
+typedef struct              /* command line settings filled by Parse() */
+{
+    char *psz_fin;          /* input file name, NULL when not given */
+    char *psz_fout;         /* output file name, NULL when not given */
+
+    float f_fps;            /* frame rate, 25.0 unless overridden */
+    char  fcc[4];           /* codec fourcc, "h264" by default; 4 bytes, no terminator */
+} cfg_t;
+
+typedef struct              /* growable byte buffer, see vbuf_init/add/reset */
+{
+    int i_data;             /* number of bytes currently stored */
+    int i_data_max;         /* size in bytes requested for p_data */
+    uint8_t *p_data;        /* heap storage, resized by vbuf_add() */
+} vbuf_t;
+
+void vbuf_init( vbuf_t * );
+void vbuf_add( vbuf_t *, int i_data, void *p_data );
+void vbuf_reset( vbuf_t * );
+
+typedef struct              /* state of the AVI file being written */
+{
+    FILE *f;                /* output stream */
+
+    float f_fps;            /* frame rate written to the headers */
+    char  fcc[4];           /* codec fourcc written to the headers */
+
+    int   i_width;          /* picture size written to the headers; 0 until set */
+    int   i_height;
+
+    int64_t i_movi;         /* file position right after the header (set by avi_init) */
+    int64_t i_movi_end;     /* 0 until known; the header writes 0xFFFFFFFF meanwhile */
+    int64_t i_riff;         /* 0 until known; the header writes 0xFFFFFFFF meanwhile */
+
+    int      i_frame;       /* frame count; the index holds 16 bytes per frame */
+    int      i_idx_max;     /* presumably the allocated capacity of idx - TODO confirm */
+    uint32_t *idx;          /* index data dumped by avi_write_idx() */
+} avi_t;
+
+void avi_init( avi_t *, FILE *, float, char fcc[4] );
+void avi_write( avi_t *, vbuf_t *, int  );
+void avi_end( avi_t * );
+
+enum nal_unit_type_e    /* values stored in nal_t.i_type */
+{
+    NAL_UNKNOWN = 0,
+    NAL_SLICE   = 1,
+    NAL_SLICE_DPA   = 2,
+    NAL_SLICE_DPB   = 3,
+    NAL_SLICE_DPC   = 4,
+    NAL_SLICE_IDR   = 5,    /* ref_idc != 0 */
+    NAL_SEI         = 6,    /* ref_idc == 0 */
+    NAL_SPS         = 7,
+    NAL_PPS         = 8
+    /* ref_idc == 0 for 6,9,10,11,12 */
+};
+enum nal_priority_e     /* values stored in nal_t.i_ref_idc */
+{
+    NAL_PRIORITY_DISPOSABLE = 0,
+    NAL_PRIORITY_LOW        = 1,
+    NAL_PRIORITY_HIGH       = 2,
+    NAL_PRIORITY_HIGHEST    = 3,
+};
+
+typedef struct
+{
+    int i_ref_idc;  /* nal_priority_e */
+    int i_type;     /* nal_unit_type_e */
+
+    /* Raw payload: p_payload and i_payload are handed to bs_init() as buffer and size */
+    int     i_payload;
+    uint8_t *p_payload;
+} nal_t;
+
+typedef struct                  /* state kept by h264_parser_parse() between NALs */
+{
+    int i_width;                /* picture size from the last SPS, cropping applied */
+    int i_height;
+
+    int i_nal_type;             /* type of the previous NAL, -1 before the first */
+    int i_ref_idc;              /* ref_idc of the previous NAL, -1 before the first */
+    int i_idr_pic_id;           /* id read from the last IDR slice, -1 if none yet */
+    int i_frame_num;            /* frame number of the last slice, -1 if none yet */
+
+    int b_key;                  /* set from the slice type of the last parsed slice */
+    int i_log2_max_frame_num;   /* bit width of the frame number, from the SPS */
+} h264_t;
+
+void h264_parser_init( h264_t * );
+void h264_parser_parse( h264_t *h, nal_t *n, int *pb_nal_start );
+
+
+static int nal_decode( nal_t *nal, void *p_data, int i_data );
+
+static void Help( void );
+static int  Parse( int argc, char **argv, cfg_t * );
+static int  ParseNAL( nal_t *nal, avi_t *a, h264_t *h, int *pb_slice );
+
+/****************************************************************************
+ * main: split a raw H.264 byte stream into frames and store them in an AVI
+ ****************************************************************************/
+int main( int argc, char **argv )
+{
+    cfg_t cfg;
+
+    FILE    *fout;
+    FILE    *fin;
+
+    vbuf_t  vb;
+    avi_t   avi;
+    h264_t  h264;
+
+    nal_t nal;
+    int i_data;
+    int b_eof;
+    int b_key;
+    int b_slice;
+
+#ifdef _MSC_VER
+    _setmode(_fileno(stdin), _O_BINARY);    /* thanks to Marcos Morais <morais at dee.ufcg.edu.br> */
+    _setmode(_fileno(stdout), _O_BINARY);
+#endif
+
+    /* Parse command line */
+    if( Parse( argc, argv, &cfg ) < 0 )
+    {
+        return -1;
+    }
+
+    /* Open input: no name, an empty name or "-" means stdin */
+    if( cfg.psz_fin == NULL || *cfg.psz_fin == '\0' || !strcmp( cfg.psz_fin, "-" ) )
+        fin = stdin;
+    else
+        fin = fopen( cfg.psz_fin, "rb" );
+    if( fin == NULL )
+    {
+        fprintf( stderr, "cannot open input file\n" );
+        return -1;
+    }
+
+    /* Open output: no name, an empty name or "-" means stdout, as Help() states */
+    if( cfg.psz_fout == NULL || *cfg.psz_fout == '\0' || !strcmp( cfg.psz_fout, "-" ) )
+        fout = stdout;
+    else
+        fout = fopen( cfg.psz_fout, "wb" );
+    if( fout == NULL )
+    {
+        fprintf( stderr, "cannot open output file\n" );
+        return -1;
+    }
+
+    /* Init avi */
+    avi_init( &avi, fout, cfg.f_fps, cfg.fcc );
+
+    /* Init parser */
+    h264_parser_init( &h264 );
+
+    /* Control-C handler */
+    signal( SIGINT, SigIntHandler );
+
+    /* Init data */
+    b_eof = 0;
+    b_key = 0;
+    b_slice = 0;
+    i_data  = 0;
+
+    /* Alloc space for a nal, used for decoding pps/sps/slice header */
+    nal.p_payload = malloc( DATA_MAX );
+
+    vbuf_init( &vb );
+
+    /* split frame: each iteration takes one start-code delimited NAL from
+     * data[], writes the buffered frame out when the NAL begins a new one,
+     * then appends the NAL to the frame buffer */
+    while( !i_ctrl_c )
+    {
+        uint8_t *p, *p_next, *end;
+        int i_size;
+
+        /* fill buffer */
+        if( i_data < DATA_MAX && !b_eof )
+        {
+            int i_read = fread( &data[i_data], 1, DATA_MAX - i_data, fin );
+            if( i_read <= 0 )
+                b_eof = 1;
+            else
+                i_data += i_read;
+        }
+        if( i_data < 3 )
+            break;
+
+        end = &data[i_data];
+
+        /* Search begin of a NAL (00 00 01 start code) */
+        p = &data[0];
+        while( p < end - 3 )
+        {
+            if( p[0] == 0x00 && p[1] == 0x00 && p[2] == 0x01 )
+            {
+                break;
+            }
+            p++;
+        }
+
+        if( p >= end - 3 )
+        {
+            fprintf( stderr, "garbage (i_data = %d)\n", i_data );
+            i_data = 0;
+            continue;
+        }
+
+        /* Search end of NAL: the next start code */
+        p_next = p + 3;
+        while( p_next < end - 3 )
+        {
+            if( p_next[0] == 0x00 && p_next[1] == 0x00 && p_next[2] == 0x01 )
+            {
+                break;
+            }
+            p_next++;
+        }
+
+        /* no further start code and the buffer is not full: the NAL runs to the end */
+        if( p_next == end - 3 && i_data < DATA_MAX )
+            p_next = end;
+
+        /* Compute NAL size */
+        i_size = p_next - p - 3;
+        if( i_size <= 0 )
+        {
+            if( b_eof )
+                break;
+
+            fprintf( stderr, "nal too large (FIXME) ?\n" );
+            i_data = 0;
+            continue;
+        }
+
+        /* Nal start at p+3 with i_size length; at most 2048 bytes are decoded */
+        nal_decode( &nal, p +3, i_size < 2048 ? i_size : 2048 );
+
+        b_key = h264.b_key;     /* key flag of the buffered frame, read before this NAL is parsed */
+
+        if( b_slice && vb.i_data && ( nal.i_type == NAL_SPS || nal.i_type == NAL_PPS ) )
+        {
+            avi_write( &avi, &vb, b_key );
+            vbuf_reset( &vb );
+            b_slice = 0;
+        }
+
+        /* Parse SPS/PPS/Slice */
+        if( ParseNAL( &nal, &avi, &h264, &b_slice ) && vb.i_data > 0 )
+        {
+            avi_write( &avi, &vb, b_key );
+            vbuf_reset( &vb );
+        }
+
+        /* fprintf( stderr, "nal:%d ref:%d\n", nal.i_type, nal.i_ref_idc ); */
+
+        /* Append NAL to buffer, start code included */
+        vbuf_add( &vb, i_size + 3, p );
+
+        /* Remove this nal */
+        memmove( &data[0], p_next, end - p_next );
+        i_data -= p_next - &data[0];
+    }
+
+    if( vb.i_data > 0 )
+    {
+        avi_write( &avi, &vb, h264.b_key );
+    }
+
+    avi.i_width  = h264.i_width;
+    avi.i_height = h264.i_height;
+
+    avi_end( &avi );
+
+    /* free mem */
+    free( nal.p_payload );
+
+    fclose( fin );
+    fclose( fout );
+
+    return 0;
+}
+
+/*****************************************************************************
+ * Help:
+ *****************************************************************************/
+static void Help( void )
+{
+    fputs(   /* usage text, sent to stderr in one call */
+             "avc2avi\n"
+             "Syntax: avc2avi [options] [ -i input.h264 ] [ -o output.avi ]\n"
+             "\n"
+             "  -h, --help                  Print this help\n"
+             "\n"
+             "  -i, --input                 Specify input file (default: stdin)\n"
+             "  -o, --output                Specify output file (default: stdout)\n"
+             "\n"
+             "  -f, --fps <float>           Set FPS (default: 25.0)\n"
+             "  -c, --codec <string>        Set the codec fourcc (default: 'h264')\n"
+             "\n", stderr );
+}
+
+/*****************************************************************************
+ * Parse: fill cfg from the command line (0: go on, -1: stop, help or error)
+ *****************************************************************************/
+static int  Parse( int argc, char **argv, cfg_t *cfg )
+{
+    /* Set default values */
+    cfg->psz_fin = NULL;
+    cfg->psz_fout = NULL;
+    cfg->f_fps = 25.0;
+    memcpy( cfg->fcc, "h264", 4 );
+
+    /* Parse command line options */
+    opterr = 0; /* getopt stays silent, errors are reported below */
+    for( ;; )
+    {
+        int long_options_index;
+        static struct option long_options[] =
+        {
+            { "help",   no_argument,       NULL, 'h' },
+            { "input",  required_argument, NULL, 'i' },
+            { "output", required_argument, NULL, 'o' },
+            { "fps",    required_argument, NULL, 'f' },
+            { "codec",  required_argument, NULL, 'c' },
+            {0, 0, 0, 0}
+        };
+
+        int c = getopt_long( argc, argv, "hi:o:f:c:",
+                             long_options, &long_options_index);
+        if( c == -1 )
+            break;
+
+        switch( c )
+        {
+            case 'h':
+                Help();
+                return -1;
+
+            case 0:
+                break;
+            case 'i':
+                cfg->psz_fin = strdup( optarg );
+                break;
+            case 'o':
+                cfg->psz_fout = strdup( optarg );
+                break;
+            case 'f':
+                cfg->f_fps = atof( optarg );
+                /* the AVI header divides by the rate: refuse 0, negative, NaN */
+                if( !( cfg->f_fps > 0 ) )
+                {
+                    fprintf( stderr, "invalid fps (%s)\n", optarg );
+                    return -1;
+                }
+                break;
+            case 'c':
+                memset( cfg->fcc, ' ', 4 );     /* names shorter than 4 are space padded */
+                memcpy( cfg->fcc, optarg, strlen( optarg ) < 4 ? strlen( optarg ) : 4 );
+                break;
+
+            default:
+                fprintf( stderr, "unknown option (%c)\n", optopt );
+                return -1;
+        }
+    }
+
+    return 0;
+}
+
+/*****************************************************************************
+ * h264_parser_*:
+ *****************************************************************************/
+void h264_parser_init( h264_t *h )
+{
+    /* nothing parsed yet: no picture size, no frame-number width, no key frame */
+    h->i_width = h->i_height = 0;
+    h->i_log2_max_frame_num = 0;
+    h->b_key = 0;
+
+    /* -1: no value recorded so far */
+    h->i_nal_type = h->i_ref_idc = -1;
+    h->i_idr_pic_id = h->i_frame_num = -1;
+}
+/* Parse the beginning of an SPS or slice NAL and update the state in h: an
+ * SPS gives the frame-number width and the cropped picture size, a slice the
+ * key-frame flag, frame number and IDR picture id.  *pb_nal_start is set to 1
+ * for SPS/PPS and for slices whose frame number or IDR id changed, else 0. */
+void h264_parser_parse( h264_t *h, nal_t *nal, int *pb_nal_start )
+{
+    bs_t s;
+    *pb_nal_start = 0;
+    if( nal->i_type == NAL_SPS || nal->i_type == NAL_PPS )
+        *pb_nal_start = 1;
+
+    bs_init( &s, nal->p_payload, nal->i_payload );
+    if( nal->i_type == NAL_SPS )
+    {
+        int i_tmp;
+
+        bs_skip( &s, 8 + 1+1+1 + 5 + 8 );   /* profile, 3 constraint flags, 5 reserved bits, level */
+        /* sps id */
+        bs_read_ue( &s );
+        /* Read i_log2_max_frame_num (the stream stores the value minus 4) */
+        h->i_log2_max_frame_num = bs_read_ue( &s ) + 4;
+        /* Read poc_type */
+        i_tmp = bs_read_ue( &s );
+        if( i_tmp == 0 )
+        {
+            /* skip i_log2_max_poc_lsb */
+            bs_read_ue( &s );
+        }
+        else if( i_tmp == 1 )
+        {
+            int i_cycle;
+            /* skip b_delta_pic_order_always_zero */
+            bs_skip( &s, 1 );
+            /* skip i_offset_for_non_ref_pic */
+            bs_read_se( &s );
+            /* skip i_offset_for_top_to_bottom_field */
+            bs_read_se( &s );
+            /* read i_num_ref_frames_in_poc_cycle */
+            i_cycle = bs_read_ue( &s );
+            if( i_cycle > 256 ) i_cycle = 256;
+            while( i_cycle-- > 0 )
+            {
+                /* skip i_offset_for_ref_frame, one value per entry of the cycle */
+                bs_read_se(&s );
+            }
+        }
+        /* i_num_ref_frames */
+        bs_read_ue( &s );
+        /* b_gaps_in_frame_num_value_allowed */
+        bs_skip( &s, 1 );
+        /* Read size (stored in macroblocks minus 1) */
+        h->i_width  = 16 * ( bs_read_ue( &s ) + 1 );
+        h->i_height = 16 * ( bs_read_ue( &s ) + 1 );
+
+        /* b_frame_mbs_only */
+        i_tmp = bs_read( &s, 1 );
+        if( i_tmp == 0 )
+        {
+            bs_skip( &s, 1 );
+        }
+        /* b_direct8x8_inference */
+        bs_skip( &s, 1 );
+
+        /* crop ? */
+        i_tmp = bs_read( &s, 1 );
+        if( i_tmp )
+        {
+            /* left */
+            h->i_width -= 2 * bs_read_ue( &s );
+            /* right */
+            h->i_width -= 2 * bs_read_ue( &s );
+            /* top */
+            h->i_height -= 2 * bs_read_ue( &s );
+            /* bottom */
+            h->i_height -= 2 * bs_read_ue( &s );
+        }
+        /* vui: ignored */
+    }
+    else if( nal->i_type >= NAL_SLICE && nal->i_type <= NAL_SLICE_IDR )
+    {
+        int i_tmp;
+
+        /* i_first_mb */
+        bs_read_ue( &s );
+        /* picture type */
+        switch( bs_read_ue( &s ) )
+        {
+            case 0: case 5: /* P */
+            case 1: case 6: /* B */
+            case 3: case 8: /* SP */
+                h->b_key = 0;
+                break;
+            case 2: case 7: /* I */
+                h->b_key = 1;
+                break;
+            case 4: case 9: /* SI */
+                h->b_key = 1;
+                break;
+        }
+        /* pps id */
+        bs_read_ue( &s );
+        /* frame num: its width comes from the last SPS (0 bits if none was seen) */
+        i_tmp = bs_read( &s, h->i_log2_max_frame_num );
+
+        if( i_tmp != h->i_frame_num )
+            *pb_nal_start = 1;
+
+        h->i_frame_num = i_tmp;
+
+        if( nal->i_type == NAL_SLICE_IDR )
+        {
+            i_tmp = bs_read_ue( &s );
+            if( h->i_nal_type == NAL_SLICE_IDR && h->i_idr_pic_id != i_tmp )
+                *pb_nal_start = 1;
+
+            h->i_idr_pic_id = i_tmp;
+        }
+    }
+    h->i_nal_type = nal->i_type;
+    h->i_ref_idc = nal->i_ref_idc;
+}
+
+
+/* Run the H.264 parser on nal and tell the caller whether the frame buffered
+ * so far is complete: returns 1 when the parser flags a new start while
+ * slice NALs are pending (*pb_slice), else 0.  *pb_slice is cleared by such
+ * a flush and set whenever nal itself is a slice.  a is not used. */
+static int  ParseNAL( nal_t *nal, avi_t *a, h264_t *h, int *pb_slice )
+{
+    int b_new_start;
+    int b_is_slice;
+    int b_pending;
+
+    h264_parser_parse( h, nal, &b_new_start );
+    b_is_slice = nal->i_type >= NAL_SLICE && nal->i_type <= NAL_SLICE_IDR;
+    b_pending  = b_new_start && *pb_slice;
+
+    if( b_pending || b_is_slice )
+        *pb_slice = b_is_slice;
+    return b_pending;
+}
+
+/*****************************************************************************
+ * vbuf: variable buffer
+ *****************************************************************************/
+void vbuf_init( vbuf_t *v )
+{
+    v->i_data = 0;                  /* the buffer starts empty */
+    v->p_data = malloc( 10000 );    /* initial capacity */
+    v->i_data_max = v->p_data != NULL ? 10000 : 0;  /* capacity 0 after a failed malloc(): vbuf_add() then allocates */
+}
+void vbuf_add( vbuf_t *v, int i_data, void *p_data )
+{
+    if( i_data + v->i_data >= v->i_data_max )   /* would fill up: grow by i_data bytes */
+    {
+        v->i_data_max += i_data;
+        v->p_data = realloc( v->p_data, v->i_data_max );
+        if( v->p_data == NULL ) abort();        /* out of memory: stop instead of copying through NULL */
+    }
+    memcpy( &v->p_data[v->i_data], p_data, i_data );    /* append after the stored bytes */
+    v->i_data += i_data;
+}
+void vbuf_reset( vbuf_t *v )
+{
+    v->i_data = 0;  /* forget the content; p_data and i_data_max are left untouched */
+}
+
+/*****************************************************************************
+ * avi:
+ *****************************************************************************/
+void avi_write_uint16( avi_t *a, uint16_t w )
+{
+    fputc( w & 0xff, a->f );            /* low byte first: little-endian */
+    fputc( ( w >> 8 ) & 0xff, a->f );   /* then the high byte */
+}
+
+void avi_write_uint32( avi_t *a, uint32_t dw )
+{
+    int i_shift;
+    /* little-endian: least significant byte first */
+    for( i_shift = 0; i_shift < 32; i_shift += 8 )
+        fputc( ( dw >> i_shift ) & 0xff, a->f );
+}
+
+void avi_write_fourcc( avi_t *a, char fcc[4] )
+{
+    int i;
+    /* the four characters are written unchanged, in order */
+    for( i = 0; i < 4; i++ )
+        fputc( fcc[i], a->f );
+}
+
+/* Flags in avih */
+#define AVIF_HASINDEX       0x00000010  // Index at end of file?
+#define AVIF_ISINTERLEAVED  0x00000100
+#define AVIF_TRUSTCKTYPE    0x00000800  // Use CKType to find key frames?
+
+#define AVIIF_KEYFRAME      0x00000010L /* this frame is a key frame.*/
+
+/* Write the RIFF/AVI header lists up to and including the "movi" list tag.
+ * Field-name comments follow the AVI header layouts; NOTE(review): confirm. */
+void avi_write_header( avi_t *a )
+{
+    avi_write_fourcc( a, "RIFF" );
+    avi_write_uint32( a, a->i_riff > 0 ? a->i_riff - 8 : 0xFFFFFFFF );  /* 0xFFFFFFFF while i_riff is still 0 */
+    avi_write_fourcc( a, "AVI " );
+
+    avi_write_fourcc( a, "LIST" );
+    avi_write_uint32( a,  4 + 4*16 + 12 + 4*16 + 4*12 );
+    avi_write_fourcc( a, "hdrl" );
+    avi_write_fourcc( a, "avih" );
+    avi_write_uint32( a, 4*16 - 8 );
+    avi_write_uint32( a, 1000000 / a->f_fps );  /* microseconds per frame */
+    avi_write_uint32( a, 0xffffffff );          /* max bytes per second */
+    avi_write_uint32( a, 0 );                   /* padding granularity */
+    avi_write_uint32( a, AVIF_HASINDEX|AVIF_ISINTERLEAVED|AVIF_TRUSTCKTYPE);
+    avi_write_uint32( a, a->i_frame );          /* total frames */
+    avi_write_uint32( a, 0 );                   /* initial frames */
+    avi_write_uint32( a, 1 );                   /* number of streams */
+    avi_write_uint32( a, 1000000 );             /* suggested buffer size */
+    avi_write_uint32( a, a->i_width );
+    avi_write_uint32( a, a->i_height );
+    avi_write_uint32( a, 0 );                   /* four reserved words */
+    avi_write_uint32( a, 0 );
+    avi_write_uint32( a, 0 );
+    avi_write_uint32( a, 0 );
+
+    avi_write_fourcc( a, "LIST" );
+    avi_write_uint32( a,  4 + 4*16 + 4*12 );
+    avi_write_fourcc( a, "strl" );
+    avi_write_fourcc( a, "strh" );
+    avi_write_uint32( a,  4*16 - 8 );
+    avi_write_fourcc( a, "vids" );              /* stream type: video */
+    avi_write_fourcc( a, a->fcc );              /* handler: the codec fourcc */
+    avi_write_uint32( a, 0 );                   /* flags */
+    avi_write_uint32( a, 0 );                   /* priority and language */
+    avi_write_uint32( a, 0 );                   /* initial frames */
+    avi_write_uint32( a, 1000 );                /* scale */
+    avi_write_uint32( a, a->f_fps * 1000 );     /* rate: rate / scale = frames per second */
+    avi_write_uint32( a, 0 );                   /* start */
+    avi_write_uint32( a, a->i_frame );          /* length in frames */
+    avi_write_uint32( a, 1024*1024 );           /* suggested buffer size */
+    avi_write_uint32( a, -1 );                  /* quality */
+    avi_write_uint32( a, a->i_width * a->i_height );    /* sample size */
+    avi_write_uint32( a, 0 );                   /* frame rectangle: left and top */
+    avi_write_uint16( a, a->i_width );          /* right */
+    avi_write_uint16( a, a->i_height );         /* bottom */
+
+    avi_write_fourcc( a, "strf" );
+    avi_write_uint32( a,  4*12 - 8 );           /* chunk size */
+    avi_write_uint32( a,  4*12 - 8 );           /* structure size */
+    avi_write_uint32( a,  a->i_width );
+    avi_write_uint32( a,  a->i_height );
+    avi_write_uint16( a,  1 );                  /* planes */
+    avi_write_uint16( a,  24 );                 /* bits per pixel */
+    avi_write_fourcc( a,  a->fcc );             /* compression: the codec fourcc */
+    avi_write_uint32( a, a->i_width * a->i_height );    /* image size */
+    avi_write_uint32( a,  0 );
+    avi_write_uint32( a,  0 );
+    avi_write_uint32( a,  0 );
+    avi_write_uint32( a,  0 );
+
+    avi_write_fourcc( a, "LIST" );
+    avi_write_uint32( a,  a->i_movi_end > 0 ? a->i_movi_end - a->i_movi + 4: 0xFFFFFFFF );  /* 0xFFFFFFFF while i_movi_end is still 0 */
+    avi_write_fourcc( a, "movi" );
+}
+
+void avi_write_idx( avi_t *a )
+{
+    avi_write_fourcc( a, "idx1" );              /* index chunk tag */
+    avi_write_uint32( a,  a->i_frame * 16 );    /* payload size: 16 bytes per recorded frame */
+    fwrite( a->idx, a->i_frame * 16, 1, a->f ); /* entries were already serialized byte by byte in avi_write() */
+}
+
+void avi_init( avi_t *a, FILE *f, float f_fps, char fcc[4] )
+{
+    /* Output target and stream parameters supplied by the caller. */
+    a->f     = f;
+    a->f_fps = f_fps;
+    memcpy( a->fcc, fcc, 4 );
+    /* The picture size is not an argument: this function leaves it at 0. */
+    a->i_width = a->i_height = 0;
+    /* Nothing written so far: no frames and no index storage. */
+    a->i_frame   = 0;
+    a->i_idx_max = 0;
+    a->idx       = NULL;
+    /* With i_riff and i_movi_end at 0 the header gets 0xFFFFFFFF placeholder sizes. */
+    a->i_riff = a->i_movi = a->i_movi_end = 0;
+    /* Provisional header; avi_end() seeks back and writes it again. */
+    avi_write_header( a );
+    a->i_movi = ftell( a->f );  /* position right after the "movi" tag */
+}
+
+static void avi_set_dw( void *_p, uint32_t dw )
+{
+    /* Store dw at _p as four bytes, least significant first, whatever the
+     * byte order of the host is. */
+    uint8_t *out = _p;
+    int i_shift;
+    for( i_shift = 0; i_shift < 32; i_shift += 8 )
+        *out++ = ( dw >> i_shift ) & 0xff;
+}
+
+void avi_write( avi_t *a, vbuf_t *v, int b_key )
+{
+    /* Append v as one "00dc" chunk and index it; b_key marks a key frame. */
+    int64_t i_pos = ftell( a->f );      /* offset of the chunk header */
+    avi_write_fourcc( a, "00dc" );
+    avi_write_uint32( a, v->i_data );
+    fwrite( v->p_data, v->i_data, 1, a->f );
+    if( v->i_data&0x01 )
+        fputc( 0, a->f );               /* pad the chunk to an even length */
+
+    /* Grow the index 1000 entries at a time.  On allocation failure keep the
+     * old table: the chunk stays in the file but is neither indexed nor
+     * counted, instead of writing through a NULL pointer. */
+    if( a->i_idx_max <= a->i_frame )
+    {
+        void *p_idx = realloc( a->idx, ( a->i_idx_max + 1000 ) * 16 );
+        if( p_idx == NULL )
+        {
+            fprintf( stderr, "avi: out of memory, frame %d not indexed\n", a->i_frame );
+            return;
+        }
+        a->idx = p_idx;
+        a->i_idx_max += 1000;
+    }
+    memcpy( &a->idx[4*a->i_frame+0], "00dc", 4 );   /* entry = tag, flags, offset, size */
+    avi_set_dw( &a->idx[4*a->i_frame+1], b_key ? AVIIF_KEYFRAME : 0 );
+    avi_set_dw( &a->idx[4*a->i_frame+2], i_pos );
+    avi_set_dw( &a->idx[4*a->i_frame+3], v->i_data );
+    a->i_frame++;
+}
+
+void avi_end( avi_t *a )
+{
+    /* The frame data stops here; the header needs this for the "movi" size. */
+    a->i_movi_end = ftell( a->f );
+
+    /* The idx1 chunk goes right after the frame data; the position reached
+     * once it is written is what the RIFF size is derived from. */
+    avi_write_idx( a );
+    a->i_riff = ftell( a->f );
+
+    /* Both values are known now: overwrite the provisional header in place. */
+    fseek( a->f, 0, SEEK_SET );
+    avi_write_header( a );
+
+    fprintf( stderr, "avi file written\n"
+                     "  - codec: %4.4s\n" "  - size: %dx%d\n"
+                     "  - fps: %.3f\n" "  - frames: %d\n",
+             a->fcc, a->i_width, a->i_height, a->f_fps, a->i_frame );
+}
+
+/*****************************************************************************
+ * nal:
+ *****************************************************************************/
+int nal_decode( nal_t *nal, void *p_data, int i_data )
+{
+    /* Read the 1-byte NAL header from p_data and copy the rest to
+     * nal->p_payload without the 0x03 of each 0x00 0x00 0x03 sequence.
+     * Returns 0, or -1 if i_data leaves no header byte to read. */
+    uint8_t *src = p_data;
+    uint8_t *end;
+    uint8_t *dst = nal->p_payload;
+
+    if( i_data < 1 ) return -1;
+    end = src + i_data;
+    nal->i_type    = src[0]&0x1f;
+    nal->i_ref_idc = (src[0] >> 5)&0x03;
+    for( src++; src < end; )
+    {
+        /* >= 3 bytes left: this also catches an escape that ends the NAL */
+        if( end - src >= 3 && src[0] == 0x00 && src[1] == 0x00 && src[2] == 0x03 )
+        {
+            *dst++ = 0x00;
+            *dst++ = 0x00;
+            src += 3;
+        }
+        else *dst++ = *src++;
+    }
+    nal->i_payload = dst - (uint8_t*)nal->p_payload;   /* bytes produced, measured in the output buffer */
+    return 0;
+}
+
diff --git a/tools/x264-rd.sh b/tools/x264-rd.sh
new file mode 100755
index 00000000..8a287b56
--- /dev/null
+++ b/tools/x264-rd.sh
@@ -0,0 +1,32 @@
+#!/bin/sh
+
+X264="../x264"
+YUV="/usr/src/yuv/af-720x576.yuv"
+OUT="/tmp/x264-$$.h264"
+
+DAT="x264-rd.dat"
+
+OPTS="-c"
+
+# Init
+rm -f "$DAT"
+echo "#QP kb/s   PSNR Y     U     V     fps" > $DAT
+
+for qp in `seq 1 51`
+do
+    LOG="/tmp/x264-$qp-$$.log"
+    # clean
+    rm -f "$LOG"
+    # encode
+    $X264 "$YUV" -o "$OUT" --qp $qp $OPTS 2> "$LOG"
+    # gather stats
+    cat "$LOG" |
+    grep '^x264: overall' |
+    sed 's/^x264: overall PSNR Y:\([[:digit:]]*\.[[:digit:]]*\) U:\([[:digit:]]*\.[[:digit:]]*\) V:\([[:digit:]]*\.[[:digit:]]*\) kb\/s:\([[:digit:]]*\.[[:digit:]]*\) fps:\([[:digit:]]*\.[[:digit:]]*\)$/\1 \2 \3 \4 \5/g' |
+    awk -v QP=$qp '{ printf( "%2d %7.1f      %5.2f %5.2f %5.2f %5.3f\n", QP, $4, $1, $2, $3, $5 ); }' >> $DAT
+done
+
+# Clean
+rm -f "$OUT"
+rm -f "$LOG"
+
diff --git a/tools/xyuv.c b/tools/xyuv.c
new file mode 100644
index 00000000..0a5c0928
--- /dev/null
+++ b/tools/xyuv.c
@@ -0,0 +1,607 @@
+/*****************************************************************************
+ * xyuv.c: an SDL YUV 4:2:0 planar viewer.
+ *****************************************************************************
+ * Copyright (C) 2004 Laurent Aimar
+ * $Id: xyuv.c,v 1.1 2004/06/03 19:27:08 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdint.h>
+
+#include <SDL/SDL.h>
+
+#define YUV_MAX 20
+#define SDL_TITLE "xyuv: %s - %d/%d - %.2ffps"
+typedef struct
+{
+    /* globals */
+    int     i_width;        /* size of one input picture, from the options or the file name */
+    int     i_height;
+    int     i_frame_size;   /* bytes per frame: 3 * width * height / 2 */
+    int     i_frame;        /* frame currently shown, counted from 1 */
+    int     i_frames;       /* largest frame count among the opened files */
+    float   f_fps;          /* playback rate used for the wait between frames */
+
+    float   f_y;            /* luma multiplier applied before display; 0.0 disables it */
+
+    int     b_pause;        /* playback stopped */
+    int     b_grid;         /* draw a dotted grid with a 16 pixel period */
+    int     b_split;        /* show the three planes side by side in the luma plane */
+    int     b_diff;         /* show the absolute difference of two files, in split layout */
+    int     i_join;         /* column where picture 1 ends and picture 2 starts; used only when > 0 */
+
+    /* Constructed picture */
+    int     i_wall_width;   /* in picture count */
+
+    /* YUV files */
+    int     i_yuv;          /* number of valid entries in yuv[] */
+    struct
+    {
+        char    *name;      /* strdup'ed command line argument */
+        FILE    *f;         /* handles */
+        int     i_frames;   /* frames count */
+
+        /* Position in the whole picture */
+        int     x, y;
+    } yuv[YUV_MAX];
+
+    /* SDL */
+    int i_sdl_width;        /* size of the overlay holding the whole wall */
+    int i_sdl_height;
+
+    int i_display_width;    /* size the overlay is displayed at; follows window resizes */
+    int i_display_height;
+    char *title;            /* window caption, allocated by asprintf in xyuv_display() */
+
+    SDL_Surface *screen;
+    SDL_Overlay *overlay;
+
+    /* scratch buffer holding one frame read from a file (i_frame_size bytes) */
+    uint8_t *pic;
+
+} xyuv_t;
+
+xyuv_t xyuv = {
+    /* size unknown until the command line or the file name provides it */
+    .i_width  = 0, .i_height = 0,
+    /* playback: frames are numbered from 1 */
+    .i_frame  = 1, .i_frames = 0,
+    .f_fps    = 25.0,
+    .b_pause  = 0,
+    /* picture processing, all off by default */
+    .f_y      = 0.0,
+    .b_grid   = 0,
+    .b_split  = 0,
+    .b_diff   = 0,
+    .i_join   = -1,
+    /* files and wall layout; a wall width of 0 is replaced in main() */
+    .i_yuv    = 0,
+    .i_wall_width = 0,
+    /* allocated later */
+    .title = NULL, .pic = NULL,
+};
+
+static void help( void )
+{   /* Print the command line summary on stderr; the option names must match the parser in main(). */
+    fprintf( stderr,
+             "Syntax: xyuv [options] file [file2 ...]\n"
+             "\n"
+             "      --help                  Print this help\n"
+             "\n"
+             "  -s, --size <WIDTHxHEIGHT>   Set input size\n"
+             "  -w, --width <integer>       Set width\n"
+             "  -h, --height <integer>      Set height\n"
+             "\n"
+             "  -S, --split                 Show split Y/U/V planes\n"
+             "  -d, --diff                  Show difference (only 2 files) in split mode\n"
+             "  -j, --join <integer>        Show file 1 left of this column, file 2 right of it (only 2 files)\n"
+             "\n"
+             "  -y <float>                  Set Y factor\n"
+             "\n"
+             "  -g, --grid                  Show a grid (macroblock 16x16)\n"
+             "  -W <integer>                Set wall width (in picture count)\n"
+             "  -f, --fps <float>           Set fps\n"
+             "\n" );
+}
+
+
+static void xyuv_display( xyuv_t *xyuv, int i_frame );
+
+int main( int argc, char **argv )
+{   /* Parse the options, open the files, set up SDL, then run the playback/event loop. */
+    int i;
+
+    /* Parse command line: options may be mixed with the YUV files to show */
+    for( i = 1; i < argc; i++ ) {
+        if( !strcasecmp( argv[i], "--help" ) ) {
+            help();
+            return 0;
+        }
+        if( !strcmp( argv[i], "-d" ) || !strcasecmp( argv[i], "--diff" ) ) {
+            xyuv.b_diff = 1;
+        } else if( !strcmp( argv[i], "-S" ) || !strcasecmp( argv[i], "--split" ) ) {
+            xyuv.b_split = 1;
+        } else if( !strcmp( argv[i], "-f" ) || !strcasecmp( argv[i], "--fps" ) ) {
+            if( i >= argc -1 ) goto err_missing_arg;
+            xyuv.f_fps = atof( argv[++i] );
+        } else if( !strcmp( argv[i], "-h" ) || !strcasecmp( argv[i], "--height" ) ) {
+            if( i >= argc -1 ) goto err_missing_arg;
+            xyuv.i_height = atoi( argv[++i] );
+        } else if( !strcmp( argv[i], "-w" ) || !strcasecmp( argv[i], "--width" ) ) {
+            if( i >= argc -1 ) goto err_missing_arg;
+            xyuv.i_width = atoi( argv[++i] );
+        } else if( !strcmp( argv[i], "-s" ) || !strcasecmp( argv[i], "--size" ) ) {
+            char *p;
+
+            if( i >= argc -1 ) goto err_missing_arg;
+            xyuv.i_width = strtol( argv[++i], &p, 0 );
+            /* skip the separator only if there is one: never step over the NUL */
+            xyuv.i_height = *p ? atoi( p + 1 ) : 0;
+        } else if( !strcmp( argv[i], "-W" ) ) {
+            if( i >= argc -1 ) goto err_missing_arg;
+            xyuv.i_wall_width = atoi( argv[++i] );
+        } else if( !strcmp( argv[i], "-y" ) ) {
+            if( i >= argc -1 ) goto err_missing_arg;
+            xyuv.f_y = atof( argv[++i] );
+        } else if( !strcmp( argv[i], "-j" ) || !strcasecmp( argv[i], "--join" ) ) {
+            if( i >= argc -1 ) goto err_missing_arg;
+            xyuv.i_join = atoi( argv[++i] );
+        } else if( !strcmp( argv[i], "-g" ) || !strcasecmp( argv[i], "--grid" ) ) {
+            xyuv.b_grid = 1;
+        } else if( xyuv.i_yuv >= YUV_MAX ) {    /* yuv[] is full */
+            fprintf( stderr, "too many files, ignoring %s\n", argv[i] );
+        } else {
+            FILE *f = fopen( argv[i], "rb" );
+            if( !f ) {
+                fprintf( stderr, "cannot open YUV %s\n", argv[i] );
+            } else {
+                xyuv.yuv[xyuv.i_yuv].name = strdup( argv[i] );
+                xyuv.yuv[xyuv.i_yuv].f = f;
+                xyuv.yuv[xyuv.i_yuv++].i_frames = 0;
+            }
+        }
+    }
+
+    if( xyuv.i_yuv == 0 ) {
+        fprintf( stderr, "no file to display\n" );
+        return -1;
+    }
+    if( xyuv.i_width == 0 || xyuv.i_height == 0 ) {
+        char *psz = xyuv.yuv[0].name;
+        char *num;
+        char *x;
+        /* See if we find widthxheight in the file name */
+        for( ;; )
+        {
+            if( !( x = strchr( psz+1, 'x' ) ) )
+            {
+                break;
+            }
+            num = x;
+            while( num > psz && num[-1] >= '0' && num[-1] <= '9' )
+                num--;
+
+            if( num != x && x[1] >= '0' && x[1] <= '9' )
+            {
+                xyuv.i_width = atoi( num );
+                xyuv.i_height = atoi( x+1 );
+                break;
+            }
+            psz = x;
+        }
+        fprintf( stderr, "file name gives %dx%d\n", xyuv.i_width, xyuv.i_height );
+    }
+    if( xyuv.i_width <= 0 || xyuv.i_height <= 0 ) {
+        fprintf( stderr, "invalid or missing frames size\n" );
+        return -1;
+    }
+    if( !( xyuv.f_fps > 0.0 ) ) {   /* the frame period is computed as 1000 / f_fps */
+        fprintf( stderr, "invalid fps\n" );
+        return -1;
+    }
+    if( xyuv.b_diff && xyuv.i_yuv != 2 ) {
+        fprintf( stderr, "--diff works only with 2 files\n" );
+        return -1;
+    }
+    /* i_join keeps its -1 default unless --join was given */
+    if( xyuv.i_join != -1 &&
+        ( xyuv.i_yuv != 2 || xyuv.i_join < 1 || xyuv.i_join >= xyuv.i_width ) ) {
+        fprintf( stderr, "--join works only with two files and range is [1, width-1]\n" );
+        return -1;
+    }
+    if( xyuv.i_join > 0 && xyuv.i_join % 2 != 0 ) {
+        /* chroma is subsampled by 2: keep the seam on an even column */
+        xyuv.i_join += xyuv.i_join + 1 < xyuv.i_width ? 1 : -1;
+    }
+
+    /* Now check frames */
+    fprintf( stderr, "displaying :\n" );
+    xyuv.i_frames = 0;
+    xyuv.i_frame_size = 3 * xyuv.i_width * xyuv.i_height / 2;
+    for( i = 0; i < xyuv.i_yuv; i++ ) {
+        /* Ugly, but avoids using fstat */
+        fseek( xyuv.yuv[i].f, 0, SEEK_END );
+
+        xyuv.yuv[i].i_frames = ftell( xyuv.yuv[i].f ) / xyuv.i_frame_size;
+
+        fseek( xyuv.yuv[i].f, 0, SEEK_SET );
+
+        fprintf( stderr, " - '%s' : %d frames\n", xyuv.yuv[i].name, xyuv.yuv[i].i_frames );
+
+        if( xyuv.i_frames < xyuv.yuv[i].i_frames )
+            xyuv.i_frames = xyuv.yuv[i].i_frames;
+    }
+
+    if( xyuv.i_frames == 0 ) {
+        fprintf( stderr, "no frames to display\n" );
+    }
+    xyuv.pic = malloc( xyuv.i_frame_size );
+    if( xyuv.pic == NULL ) {
+        fprintf( stderr, "cannot allocate frame buffer\n" );
+        return -1;
+    }
+
+    /* calculate SDL view: an unset wall width becomes ceil(sqrt(file count)) */
+    if( xyuv.i_wall_width > xyuv.i_yuv )
+        xyuv.i_wall_width = xyuv.i_yuv;
+    if( xyuv.i_wall_width <= 0 )
+        for( xyuv.i_wall_width = 0; xyuv.i_wall_width * xyuv.i_wall_width < xyuv.i_yuv; )
+            xyuv.i_wall_width++;
+
+    for( i = 0; i < xyuv.i_yuv; i++ ) {
+        if( xyuv.b_diff || xyuv.i_join > 0 ) {
+            xyuv.yuv[i].x = 0;
+            xyuv.yuv[i].y = 0;
+        } else if( xyuv.b_split ) {
+            xyuv.yuv[i].x = (i%xyuv.i_wall_width) * 3 * xyuv.i_width / 2;
+            xyuv.yuv[i].y = (i/xyuv.i_wall_width) * xyuv.i_height;
+        } else {
+            xyuv.yuv[i].x = (i%xyuv.i_wall_width) * xyuv.i_width;
+            xyuv.yuv[i].y = (i/xyuv.i_wall_width) * xyuv.i_height;
+        }
+    }
+    if( xyuv.b_diff ) {
+        xyuv.i_sdl_width = 3 * xyuv.i_width / 2;
+        xyuv.i_sdl_height= xyuv.i_height;
+    } else if( xyuv.i_join > 0 ) {
+        xyuv.i_sdl_width = xyuv.i_width;
+        xyuv.i_sdl_height= xyuv.i_height;
+    } else if( xyuv.b_split ) {
+        xyuv.i_sdl_width = xyuv.i_wall_width * 3 * xyuv.i_width / 2;
+        xyuv.i_sdl_height= xyuv.i_height * ( ( xyuv.i_yuv  + xyuv.i_wall_width - 1 ) / xyuv.i_wall_width );
+    } else {
+        xyuv.i_sdl_width = xyuv.i_wall_width * xyuv.i_width;
+        xyuv.i_sdl_height= xyuv.i_height * ( ( xyuv.i_yuv  + xyuv.i_wall_width - 1 ) / xyuv.i_wall_width );
+    }
+    xyuv.i_display_width = xyuv.i_sdl_width;
+    xyuv.i_display_height = xyuv.i_sdl_height;
+
+    /* Open SDL */
+    if( SDL_Init( SDL_INIT_EVENTTHREAD|SDL_INIT_NOPARACHUTE|SDL_INIT_VIDEO) ) {
+        fprintf( stderr, "cannot init SDL\n" );
+        return -1;
+    }
+
+    SDL_EnableKeyRepeat(SDL_DEFAULT_REPEAT_DELAY, 100 );
+    SDL_EventState( SDL_KEYUP, SDL_IGNORE );
+
+    xyuv.screen = SDL_SetVideoMode( xyuv.i_sdl_width, xyuv.i_sdl_height, 0,
+                                    SDL_HWSURFACE|SDL_RESIZABLE|
+                                    SDL_ASYNCBLIT|SDL_HWACCEL );
+    if( xyuv.screen == NULL ) {
+        fprintf( stderr, "SDL_SetVideoMode failed\n" );
+        return -1;
+    }
+
+    xyuv.overlay = SDL_CreateYUVOverlay( xyuv.i_sdl_width, xyuv.i_sdl_height,
+                                         SDL_YV12_OVERLAY,
+                                         xyuv.screen );
+    if( xyuv.overlay == NULL ) {
+        fprintf( stderr, "recon: SDL_CreateYUVOverlay failed\n" );
+        return -1;
+    }
+
+    /* reset with black; the planes are only touched once the overlay is known to exist and is locked */
+    SDL_LockYUVOverlay( xyuv.overlay );
+    memset( xyuv.overlay->pixels[0],   0, xyuv.overlay->pitches[0] * xyuv.i_sdl_height );
+    memset( xyuv.overlay->pixels[1], 128, xyuv.overlay->pitches[1] * xyuv.i_sdl_height / 2);
+    memset( xyuv.overlay->pixels[2], 128, xyuv.overlay->pitches[2] * xyuv.i_sdl_height / 2);
+    SDL_UnlockYUVOverlay( xyuv.overlay );
+
+    for( ;; ) {
+        SDL_Event event;
+        int64_t i_start = SDL_GetTicks();
+        int i_wait;
+
+        if( !xyuv.b_pause ) {
+            xyuv_display( &xyuv, xyuv.i_frame );
+        }
+
+        for( ;; ) {
+            int b_refresh = 0;
+            while( SDL_PollEvent( &event ) )  {
+                switch( event.type )
+                {
+                    case SDL_QUIT:
+                        exit( 1 );
+
+                    case SDL_KEYDOWN:
+                        switch( event.key.keysym.sym )
+                        {
+                            case SDLK_q:
+                            case SDLK_ESCAPE:
+                                exit(1);
+
+                            case SDLK_f:
+                                SDL_WM_ToggleFullScreen( xyuv.screen );
+                                break;
+
+                            case SDLK_g:
+                                xyuv.b_grid = !xyuv.b_grid;
+                                if( xyuv.b_pause )
+                                    b_refresh = 1;
+                                break;
+
+                            case SDLK_SPACE:
+                                xyuv.b_pause = !xyuv.b_pause;
+                                break;
+                            case SDLK_LEFT:
+                                if( xyuv.i_frame > 1 ) xyuv.i_frame--;
+                                b_refresh = 1;
+                                break;
+
+                            case SDLK_RIGHT:
+                                if( xyuv.i_frame < xyuv.i_frames ) xyuv.i_frame++;
+                                b_refresh = 1;
+                                break;
+
+                            case SDLK_HOME:
+                                xyuv.i_frame = 1;
+                                if( xyuv.b_pause )
+                                    b_refresh = 1;
+                                break;
+
+                            case SDLK_END:
+                                xyuv.i_frame = xyuv.i_frames;
+                                b_refresh = 1;
+                                break;
+
+                            case SDLK_UP:
+                                xyuv.i_frame += xyuv.i_frames / 20;
+                                if( xyuv.i_frame > xyuv.i_frames )
+                                    xyuv.i_frame = xyuv.i_frames;
+                                b_refresh = 1;
+                                break;
+
+                            case SDLK_DOWN:
+                                xyuv.i_frame -= xyuv.i_frames / 20;
+                                if( xyuv.i_frame < 1 )
+                                    xyuv.i_frame = 1;
+                                b_refresh = 1;
+                                break;
+
+                            case SDLK_PAGEUP:
+                                xyuv.i_frame += xyuv.i_frames / 10;
+                                if( xyuv.i_frame > xyuv.i_frames )
+                                    xyuv.i_frame = xyuv.i_frames;
+                                b_refresh = 1;
+                                break;
+
+                            case SDLK_PAGEDOWN:
+                                xyuv.i_frame -= xyuv.i_frames / 10;
+                                if( xyuv.i_frame < 1 )
+                                    xyuv.i_frame = 1;
+                                b_refresh = 1;
+                                break;
+
+                            default:
+                                break;
+                        }
+                        break;
+                    case SDL_VIDEORESIZE:
+                        xyuv.i_display_width = event.resize.w;
+                        xyuv.i_display_height = event.resize.h;
+                        xyuv.screen = SDL_SetVideoMode( xyuv.i_display_width, xyuv.i_display_height, 0,
+                                                        SDL_HWSURFACE|SDL_RESIZABLE|
+                                                        SDL_ASYNCBLIT|SDL_HWACCEL );
+                        if( xyuv.screen == NULL ) {
+                            fprintf( stderr, "SDL_SetVideoMode failed\n" );
+                            exit( 1 );
+                        }
+                        xyuv_display( &xyuv, xyuv.i_frame );
+                        break;
+
+                    default:
+                        break;
+                }
+            }
+            if( b_refresh ) {
+                xyuv.b_pause = 1;
+                xyuv_display( &xyuv, xyuv.i_frame );
+            }
+            /* wait */
+            i_wait = 1000 / xyuv.f_fps - ( SDL_GetTicks() - i_start);
+            if( i_wait < 0 )
+                break;
+            else if( i_wait > 200 )
+                SDL_Delay( 200 );
+            else {
+                SDL_Delay( i_wait );
+                break;
+            }
+        }
+        if( !xyuv.b_pause ) {
+            /* next frame */
+            if( xyuv.i_frame == xyuv.i_frames )
+                xyuv.b_pause = 1;
+            else if( xyuv.i_frame < xyuv.i_frames )
+                xyuv.i_frame++;
+        }
+    }
+
+    return 0;
+
+err_missing_arg:
+    fprintf( stderr, "missing arg for option=%s\n", argv[i] );
+    return -1;
+}
+
+
+static void xyuv_display( xyuv_t *xyuv, int i_frame )
+{   /* Compose frame i_frame (1-based) of every file into the overlay, show it and refresh the caption. */
+    SDL_Rect rect;
+    int i_picture = 0;  /* pictures successfully loaded so far for this frame */
+    int i;
+
+    if( i_frame < 1 || i_frame > xyuv->i_frames )
+        return;
+
+    xyuv->i_frame = i_frame;
+
+    /* Load and copy picture data */
+    for( i = 0; i < xyuv->i_yuv; i++ ) {
+        int i_plane;
+
+        if( i_frame - 1 >= xyuv->yuv[i].i_frames )
+            continue;
+        /* offset multiplied as long (fseek's type), not int; a frame that cannot be read is skipped */
+        if( fseek( xyuv->yuv[i].f, (long)(xyuv->i_frame-1) * xyuv->i_frame_size, SEEK_SET ) ||
+            fread( xyuv->pic, xyuv->i_frame_size, 1, xyuv->yuv[i].f ) != 1 )
+            continue;
+        i_picture++;
+        SDL_LockYUVOverlay( xyuv->overlay );
+
+        if( xyuv->b_diff || xyuv->b_split ) {
+            /* Reset UV */
+            for( i_plane = 1; i_plane < 3; i_plane++ ) {
+                memset( xyuv->overlay->pixels[i_plane], 128, xyuv->overlay->pitches[i_plane] * xyuv->overlay->h / 2 );
+            }
+            /* All three planes go into the Y plane of the overlay: luma on the left, chroma stacked on its right */
+
+            for( i_plane = 0; i_plane < 3; i_plane++ ) {
+                int div = i_plane == 0 ? 1 : 2;
+                uint8_t *src = xyuv->pic;
+                uint8_t *dst = xyuv->overlay->pixels[0] +
+                                (xyuv->yuv[i].x + xyuv->yuv[i].y * xyuv->overlay->pitches[0] );
+                int j;
+                if( i_plane == 1 ) {
+                    src +=  5*xyuv->i_width * xyuv->i_height/4;
+                    dst += xyuv->i_width;
+                } else if( i_plane == 2 ) {
+                    src += xyuv->i_width * xyuv->i_height;
+                    dst += xyuv->i_width + xyuv->i_height / 2 * xyuv->overlay->pitches[0];
+                }
+
+                for( j = 0; j < xyuv->i_height / div; j++ ) {
+                    if( i_picture == 1 || xyuv->b_split ) {
+                        memcpy( dst, src, xyuv->i_width / div );
+                    } else {
+                        /* diff mode, second picture: replace with |first - second| */
+                        int k;
+                        for( k = 0; k < xyuv->i_width / div; k++ ) {
+                            dst[k] = abs( dst[k] - src[k]);
+                        }
+                    }
+                    src += xyuv->i_width / div;
+                    dst += xyuv->overlay->pitches[0];
+                }
+            }
+        } else {
+            for( i_plane = 0; i_plane < 3; i_plane++ ) {
+                int div = i_plane == 0 ? 1 : 2;
+                uint8_t *src = xyuv->pic;
+                uint8_t *dst = xyuv->overlay->pixels[i_plane] +
+                                ((xyuv->yuv[i].x + xyuv->yuv[i].y * xyuv->overlay->pitches[i_plane] ) / div );
+                int w = xyuv->i_width / div;
+                int j;
+
+                /* overlay plane 1 takes the last quarter-size plane of the frame, plane 2 the one before it */
+                if( i_plane == 1 ) {
+                    src +=  5*xyuv->i_width * xyuv->i_height/4;
+                } else if( i_plane == 2 ) {
+                    src += xyuv->i_width * xyuv->i_height;
+                }
+                if( xyuv->i_join > 0 ) {
+                    if( i_picture > 1 ) {
+                        src += xyuv->i_join / div;
+                        dst += xyuv->i_join / div;
+                        w = (xyuv->i_width - xyuv->i_join) /div;
+                    } else {
+                        w = xyuv->i_join / div;
+                    }
+                }
+                for( j = 0; j < xyuv->i_height / div; j++ ) {
+                    memcpy( dst, src, w );
+                    src += xyuv->i_width / div;
+                    dst += xyuv->overlay->pitches[i_plane];
+                }
+            }
+        }
+
+        SDL_UnlockYUVOverlay( xyuv->overlay );
+    }
+    SDL_LockYUVOverlay( xyuv->overlay );    /* the two filters below edit the luma plane in place */
+    if( xyuv->f_y != 0.0 ) {
+        uint8_t *pix = xyuv->overlay->pixels[0];
+        int j;
+
+        for( j = 0; j < xyuv->i_sdl_height; j++ ) {
+            int k;
+            for( k = 0; k < xyuv->i_sdl_width; k++ ) {
+                int v= pix[k] * xyuv->f_y;
+                if( v > 255 )
+                    pix[k] = 255;
+                else if( v < 0 )
+                    pix[k] = 0;
+                else
+                    pix[k] = v;
+            }
+            pix += xyuv->overlay->pitches[0];
+        }
+    }
+    if( xyuv->b_grid ) {
+        int x, y;
+
+        for( y = 0; y < xyuv->i_sdl_height; y += 4 ) {
+            uint8_t *p = xyuv->overlay->pixels[0] + y * xyuv->overlay->pitches[0];
+            for( x = 0; x < xyuv->i_sdl_width; x += 4 ) {
+                if( x%16== 0 || y%16 == 0 )
+                    p[x] = 0;
+            }
+        }
+    }
+    SDL_UnlockYUVOverlay( xyuv->overlay );
+    /* Update display */
+    rect.x = 0;
+    rect.y = 0;
+    rect.w = xyuv->i_display_width;
+    rect.h = xyuv->i_display_height;
+    SDL_DisplayYUVOverlay( xyuv->overlay, &rect );
+
+    /* Display title; asprintf leaves the pointer undefined when it fails */
+    free( xyuv->title );
+    if( asprintf( &xyuv->title, SDL_TITLE, xyuv->yuv[0].name, xyuv->i_frame, xyuv->i_frames, xyuv->f_fps ) < 0 )
+        xyuv->title = NULL;
+    SDL_WM_SetCaption( xyuv->title ? xyuv->title : "xyuv", "" );
+}
+
+
+
+
diff --git a/vfw/build/cygwin/Makefile b/vfw/build/cygwin/Makefile
new file mode 100644
index 00000000..e5f37468
--- /dev/null
+++ b/vfw/build/cygwin/Makefile
@@ -0,0 +1,117 @@
+##############################################################################
+#
+# Makefile for x264 VFW driver
+#
+# Author: XviD project:
+#            - ??? <cutka at szm.sk>,
+#            - Edouard Gomez <ed.gomez at free.fr>
+#            - Peter Ross <pross@xvid.org>
+# Ported to x264 by Laurent Aimar <fenrir@via.ecp.fr>
+#
+# $Id: Makefile,v 1.1 2004/06/03 19:29:33 fenrir Exp $
+##############################################################################
+
+# Dll to build
+DLL=x264vfw.dll
+
+# Current dir
+DIR_CUR=$(shell pwd)
+
+# Paths to the include files, the library and the sources
+DIR_INC=$(DIR_CUR)/../../..
+DIR_LIB=$(DIR_CUR)/../../..
+DIR_SRC=$(DIR_CUR)/../..
+
+# Sources
+SRC_C= codec.c config.c driverproc.c
+SRC_RES= resource.rc
+
+# Alias
+RM= rm -rf
+WINDRES=windres
+
+##############################################################################
+# CFLAGS
+##############################################################################
+
+# Constants which should not be modified
+# The `mingw-runtime` package is required when building with -mno-cygwin
+CFLAGS += -I$(DIR_SRC)/w32api -I$(DIR_INC)
+CFLAGS += -D_WIN32_IE=0x0500
+CFLAGS += -mno-cygwin
+
+# Optional Compiler options
+CFLAGS += -Wall
+CFLAGS += -O2
+CFLAGS += -fstrength-reduce
+CFLAGS += -finline-functions
+CFLAGS += -fgcse
+CFLAGS += -freduce-all-givs
+CFLAGS += -ffast-math
+
+##############################################################################
+# Compiler flags for linking stage
+##############################################################################
+
+LDFLAGS += -L$(DIR_LIB) -lx264
+
+##############################################################################
+# Rules (objects are written to $(DIR_BUILD) and found again through VPATH)
+##############################################################################
+
+OBJECTS = $(SRC_C:.c=.obj)
+OBJECTS+= $(SRC_RES:.rc=.obj)
+
+.SUFFIXES: .obj .rc .c
+
+DIR_BUILD= $(DIR_CUR)/bin
+VPATH = $(DIR_SRC):$(DIR_BUILD)
+
+.PHONY: all clean install uninstall  # commands, not files: run them even if such a file exists
+all: $(DLL)
+
+$(DIR_BUILD):
+	@echo " D: $(DIR_BUILD)"
+	@mkdir -p $(DIR_BUILD)
+
+.rc.obj:
+	@echo " W: $(@D)/$(<F)"
+	@mkdir -p $(DIR_BUILD)/$(@D)
+	@$(WINDRES) \
+	--include-dir=$(DIR_SRC) \
+	--input-format=rc \
+	--output-format=coff \
+	-o $(DIR_BUILD)/$@ $<
+
+.c.obj:
+	@echo " C: $(@D)/$(<F)"
+	@mkdir -p $(DIR_BUILD)/$(@D)
+	@$(CC) $(CFLAGS) -c -o $(DIR_BUILD)/$@ $<
+
+$(DLL): $(DIR_BUILD) $(OBJECTS)
+	@echo " L: $(@F)"
+	@cp $(DIR_SRC)/driverproc.def $(DIR_BUILD)/driverproc.def
+	@cd $(DIR_BUILD) && \
+	$(CC) \
+	-mno-cygwin -shared -Wl,-dll,--out-implib,$@.a,--enable-stdcall-fixup \
+	-o $@ \
+	$(OBJECTS) driverproc.def \
+	-lgdi32 -lwinmm -lcomdlg32 -lcomctl32 $(LDFLAGS)
+
+clean:
+	@echo " Cl: Object files and target lib"
+	@$(RM) $(DIR_BUILD)
+
+install:
+	@echo " I: x264vfw.dll"
+	@cp $(DIR_BUILD)/$(DLL) $(DLL)
+	@cp $(DIR_SRC)/build/win32/bin/x264vfw.inf .
+	@rundll32.exe setupapi,InstallHinfSection DefaultInstall 132 ./x264vfw.inf
+	@rm $(DLL) x264vfw.inf
+
+uninstall:
+	@echo " U: x264vfw.dll"
+	@cp $(DIR_SRC)/build/win32/bin/x264vfw.inf .
+	@rundll32.exe setupapi,InstallHinfSection Remove_x264 132 ./x264vfw.inf
+	@rm x264vfw.inf
+
diff --git a/vfw/build/win32/bin/x264vfw.inf b/vfw/build/win32/bin/x264vfw.inf
new file mode 100644
index 00000000..d4ff64a1
--- /dev/null
+++ b/vfw/build/win32/bin/x264vfw.inf
@@ -0,0 +1,91 @@
+; x264 Codec install
+
+[Version]
+Signature = "$CHICAGO$"
+Class = MEDIA
+
+[SourceDisksNames]
+1="x264 Codec Install Disk",, 0001
+
+[SourceDisksFiles]
+x264vfw.dll=1
+x264vfw.inf=1
+
+[Installable.Drivers]
+x264 = 1:x264vfw.dll, "vidc.X264", "x264 H.264 Video Codec" , , ,
+
+[DefaultInstall]
+CopyFiles=H264.Copy.Inf,H264.Copy
+Updateinis = H264.Updateini
+DelReg = H264.DelConfig
+addreg = H264.AddReg,H264.AddReg9x,H264.DoReg
+MediaType = SOFTWARE
+
+[DefaultInstall.ntx86]
+CopyFiles=H264.Copy.Inf,H264.Copy
+DelReg = H264.DelConfig
+addreg = H264.AddReg,H264.AddRegNT,H264.DoReg
+MediaType = SOFTWARE
+
+[Remove_x264]
+AddReg = H264.Unregister
+DelReg = H264.DelReg
+DelFiles = H264.Copy,H264.Copy.Inf
+UpdateInis = H264.DelIni
+
+[H264.Copy]
+x264vfw.dll
+
+[H264.Copy.Inf]
+x264vfw.inf
+
+[H264.UpdateIni]
+system.ini, drivers32,,"vidc.X264=x264vfw.dll"
+
+[H264.DelIni]
+system.ini, drivers32,"vidc.X264=x264vfw.dll",
+
+[H264.AddReg]
+
+[H264.AddReg9x]
+HKLM,SYSTEM\CurrentControlSet\Control\MediaResources\icm\vidc.X264,Description,,%x264%
+HKLM,SYSTEM\CurrentControlSet\Control\MediaResources\icm\vidc.X264,Driver,,x264vfw.dll
+HKLM,SYSTEM\CurrentControlSet\Control\MediaResources\icm\vidc.X264,FriendlyName,,"x264"
+
+HKLM,%UnInstallPath%,DisplayName,,%UninstallDispName%
+HKLM,%UnInstallPath%,UninstallString,,"%10%\rundll.exe setupx.dll,InstallHinfSection Remove_x264 132 %17%\%InfFile%"
+
+[H264.AddRegNT]
+HKLM,SOFTWARE\Microsoft\Windows NT\CurrentVersion\drivers.desc,x264vfw.dll,,%x264%
+HKLM,SOFTWARE\Microsoft\Windows NT\CurrentVersion\drivers32,vidc.X264,,x264vfw.dll
+
+HKLM,%UnInstallPath%,DisplayName,,%UninstallDispName%
+HKLM,%UnInstallPath%,UninstallString,,"%11%\rundll32.exe setupapi,InstallHinfSection Remove_x264 132 %17%\%InfFile%"
+
+[H264.DoReg]
+;HKLM,Software\Microsoft\Windows\CurrentVersion\RunOnce\Setup,"Registering x264 Direct Show ;Decoder...",,"%11%\regsvr32.exe /s %11%\x264.ax"
+
+[H264.DelReg]
+HKLM,SYSTEM\CurrentControlSet\Control\MediaResources\icm\vidc.X264
+
+HKLM,SOFTWARE\Microsoft\Windows NT\CurrentVersion\drivers.desc,x264vfw.dll,,""
+HKLM,%UnInstallPath%
+
+[H264.Unregister]
+;HKLM,Software\Microsoft\Windows\CurrentVersion\RunOnce\Setup,"Unregistering x264 Direct Show ;Decoder...",,"%11%\regsvr32.exe /s /u %11%\x264.ax"
+
+[H264.DelConfig]
+HKCU,Software\GNU\x264
+
+[DestinationDirs]
+DefaultDestDir = 11	; LDID_SYS
+H264.Copy = 11
+H264.Copy.Inf = 17
+
+[Strings]
+x264="x264 H.264 Video Codec"
+InfFile="x264vfw.inf"
+UninstallDispName="x264 H.264/AVC CODEC"
+UnInstallPath="Software\Microsoft\Windows\CurrentVersion\Uninstall\x264"
+MediaClassName="Media Devices"
+mfgname="Fenrir, Justin, CM"
diff --git a/vfw/build/win32/x264vfw.dsp b/vfw/build/win32/x264vfw.dsp
new file mode 100644
index 00000000..0fadf913
--- /dev/null
+++ b/vfw/build/win32/x264vfw.dsp
@@ -0,0 +1,135 @@
+# Microsoft Developer Studio Project File - Name="x264vfw" - Package Owner=<4>
+# Microsoft Developer Studio Generated Build File, Format Version 6.00
+# ** DO NOT EDIT **
+
+# TARGTYPE "Win32 (x86) Dynamic-Link Library" 0x0102
+
+CFG=x264vfw - Win32 Debug
+!MESSAGE This is not a valid makefile. To build this project using NMAKE,
+!MESSAGE use the Export Makefile command and run
+!MESSAGE 
+!MESSAGE NMAKE /f "x264vfw.mak".
+!MESSAGE 
+!MESSAGE You can specify a configuration when running NMAKE
+!MESSAGE by defining the macro CFG on the command line. For example:
+!MESSAGE 
+!MESSAGE NMAKE /f "x264vfw.mak" CFG="x264vfw - Win32 Debug"
+!MESSAGE 
+!MESSAGE Possible choices for configuration are:
+!MESSAGE 
+!MESSAGE "x264vfw - Win32 Release" (based on "Win32 (x86) Dynamic-Link Library")
+!MESSAGE "x264vfw - Win32 Debug" (based on "Win32 (x86) Dynamic-Link Library")
+!MESSAGE 
+
+# Begin Project
+# PROP AllowPerConfigDependencies 0
+# PROP Scc_ProjName ""
+# PROP Scc_LocalPath ""
+CPP=cl.exe
+MTL=midl.exe
+RSC=rc.exe
+
+!IF  "$(CFG)" == "x264vfw - Win32 Release"
+
+# PROP BASE Use_MFC 0
+# PROP BASE Use_Debug_Libraries 0
+# PROP BASE Output_Dir "Release"
+# PROP BASE Intermediate_Dir "Release"
+# PROP BASE Target_Dir ""
+# PROP Use_MFC 0
+# PROP Use_Debug_Libraries 0
+# PROP Output_Dir "obj/Release"
+# PROP Intermediate_Dir "obj/Release"
+# PROP Ignore_Export_Lib 0
+# PROP Target_Dir ""
+# ADD BASE CPP /nologo /MT /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_WINDOWS" /D "_MBCS" /D "_USRDLL" /D "VFW_EXPORTS" /YX /FD /c
+# ADD CPP /nologo /MT /W3 /GX /O2 /I "../../../extras" /I "../../.." /D "WIN32" /D "NDEBUG" /D "_WINDOWS" /D "_MBCS" /D "_USRDLL" /D "VFW_EXPORTS" /YX /FD /c
+# ADD BASE MTL /nologo /D "NDEBUG" /mktyplib203 /win32
+# ADD MTL /nologo /D "NDEBUG" /mktyplib203 /win32
+# ADD BASE RSC /l 0x804 /d "NDEBUG"
+# ADD RSC /l 0x804 /d "NDEBUG"
+BSC32=bscmake.exe
+# ADD BASE BSC32 /nologo
+# ADD BSC32 /nologo
+LINK32=link.exe
+# ADD BASE LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /dll /machine:I386
+# ADD LINK32 winmm.lib vfw32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /dll /machine:I386 /out:"bin/x264vfw.dll"
+
+!ELSEIF  "$(CFG)" == "x264vfw - Win32 Debug"
+
+# PROP BASE Use_MFC 0
+# PROP BASE Use_Debug_Libraries 1
+# PROP BASE Output_Dir "Debug"
+# PROP BASE Intermediate_Dir "Debug"
+# PROP BASE Target_Dir ""
+# PROP Use_MFC 0
+# PROP Use_Debug_Libraries 1
+# PROP Output_Dir "obj/Debug"
+# PROP Intermediate_Dir "obj/Debug"
+# PROP Ignore_Export_Lib 0
+# PROP Target_Dir ""
+# ADD BASE CPP /nologo /MTd /W3 /Gm /GX /ZI /Od /D "WIN32" /D "_DEBUG" /D "_WINDOWS" /D "_MBCS" /D "_USRDLL" /D "VFW_EXPORTS" /YX /FD /GZ /c
+# ADD CPP /nologo /MTd /W3 /Gm /GX /ZI /Od /I "../../../extras" /I "../../.." /D "WIN32" /D "_DEBUG" /D "_WINDOWS" /D "_MBCS" /D "_USRDLL" /D "VFW_EXPORTS" /YX /FD /GZ /c
+# ADD BASE MTL /nologo /D "_DEBUG" /mktyplib203 /win32
+# ADD MTL /nologo /D "_DEBUG" /mktyplib203 /win32
+# ADD BASE RSC /l 0x804 /d "_DEBUG"
+# ADD RSC /l 0x804 /d "_DEBUG"
+BSC32=bscmake.exe
+# ADD BASE BSC32 /nologo
+# ADD BSC32 /nologo
+LINK32=link.exe
+# ADD BASE LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /dll /debug /machine:I386 /pdbtype:sept
+# ADD LINK32 winmm.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /dll /debug /machine:I386 /out:"bin/x264vfw.dll" /pdbtype:sept
+
+!ENDIF 
+
+# Begin Target
+
+# Name "x264vfw - Win32 Release"
+# Name "x264vfw - Win32 Debug"
+# Begin Group "Source Files"
+
+# PROP Default_Filter "cpp;c;cxx;rc;def;r;odl;idl;hpj;bat"
+# Begin Source File
+
+SOURCE=..\..\codec.c
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\config.c
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\driverproc.c
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\driverproc.def
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\resource.rc
+# End Source File
+# End Group
+# Begin Group "Header Files"
+
+# PROP Default_Filter "h;hpp;hxx;hm;inl"
+# Begin Source File
+
+SOURCE=..\..\resource.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\x264vfw.h
+# End Source File
+# End Group
+# Begin Group "Resource Files"
+
+# PROP Default_Filter "ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe"
+# End Group
+# Begin Source File
+
+SOURCE=..\..\..\build\win32\bin\libx264.lib
+# End Source File
+# End Target
+# End Project
diff --git a/vfw/build/win32/x264vfw.dsw b/vfw/build/win32/x264vfw.dsw
new file mode 100644
index 00000000..7d24ef97
--- /dev/null
+++ b/vfw/build/win32/x264vfw.dsw
@@ -0,0 +1,29 @@
+Microsoft Developer Studio Workspace File, Format Version 6.00
+# WARNING: DO NOT EDIT OR DELETE THIS WORKSPACE FILE!
+
+###############################################################################
+
+Project: "x264vfw"=.\x264vfw.dsp - Package Owner=<4>
+
+Package=<5>
+{{{
+}}}
+
+Package=<4>
+{{{
+}}}
+
+###############################################################################
+
+Global:
+
+Package=<5>
+{{{
+}}}
+
+Package=<3>
+{{{
+}}}
+
+###############################################################################
+
diff --git a/vfw/codec.c b/vfw/codec.c
new file mode 100644
index 00000000..076d7962
--- /dev/null
+++ b/vfw/codec.c
@@ -0,0 +1,276 @@
+/*****************************************************************************
+ * codec.c: vfw x264 encoder
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: codec.c,v 1.1 2004/06/03 19:27:09 fenrir Exp $
+ *
+ * Authors: Justin Clay
+ *          Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#include "x264vfw.h"
+
+/* get_csp:
+ *  map a bitmap header to an x264 CSP, or X264_CSP_NONE if unsupported */
+static int get_csp( BITMAPINFOHEADER *hdr )
+{
+    const int i_vflip = hdr->biHeight >= 0 ? X264_CSP_VFLIP : 0;   /* only used by the RGB cases */
+
+    switch( hdr->biCompression )
+    {
+        case FOURCC_I420:
+        case FOURCC_IYUV:
+            return X264_CSP_I420;
+
+        case FOURCC_YV12:
+            return X264_CSP_YV12;
+
+        case FOURCC_YUYV:
+        case FOURCC_YUY2:
+            return X264_CSP_YUYV;
+
+        case BI_RGB:
+            if( hdr->biBitCount == 24 )
+                return X264_CSP_BGR | i_vflip;
+            if( hdr->biBitCount == 32 )
+                return X264_CSP_BGRA | i_vflip;
+            break;
+
+        default:
+            break;
+    }
+    return X264_CSP_NONE;
+}
+
+/* compress_query: ICERR_OK if lpbiInput can be compressed to lpbiOutput (NULL: test the input only) */
+LRESULT compress_query( CODEC *codec, BITMAPINFO *lpbiInput, BITMAPINFO *lpbiOutput )
+{
+    BITMAPINFOHEADER *inhdr = &lpbiInput->bmiHeader;
+    BITMAPINFOHEADER *outhdr = &lpbiOutput->bmiHeader;
+    CONFIG           *config = &codec->config;
+
+    if( get_csp( inhdr ) == X264_CSP_NONE )
+        return ICERR_BADFORMAT;
+
+    if( lpbiOutput == NULL )
+        return ICERR_OK;
+
+    if( inhdr->biWidth != outhdr->biWidth ||
+        inhdr->biHeight != outhdr->biHeight )
+        return ICERR_BADFORMAT;
+
+    /* We need x16 width/height */
+    if( inhdr->biWidth % 16 != 0 || inhdr->biHeight % 16 != 0 )
+        return ICERR_BADFORMAT;
+
+    /* The output (not the input, already validated by get_csp) must use our fourcc */
+    if( outhdr->biCompression != mmioFOURCC( config->fcc[0], config->fcc[1],
+                                             config->fcc[2], config->fcc[3] ) )
+        return ICERR_BADFORMAT;
+
+    return ICERR_OK;
+}
+
+/* compress_get_format: describe the compressed format produced for lpbiInput */
+LRESULT compress_get_format( CODEC *codec, BITMAPINFO *lpbiInput, BITMAPINFO *lpbiOutput )
+{
+    BITMAPINFOHEADER *src = &lpbiInput->bmiHeader;
+    BITMAPINFOHEADER *dst = &lpbiOutput->bmiHeader;
+    CONFIG           *cfg = &codec->config;
+
+    if( X264_CSP_NONE == get_csp( src ) )
+        return ICERR_BADFORMAT;
+
+    /* No output header to fill: report the size one needs instead */
+    if( !lpbiOutput )
+        return sizeof(BITMAPINFOHEADER);
+
+    /* Start from a copy of the input header, then overwrite what differs */
+    *dst = *src;
+    dst->biSize = sizeof( BITMAPINFOHEADER );
+    dst->biSizeImage = compress_get_size( codec, lpbiInput, lpbiOutput );
+    dst->biXPelsPerMeter = 0;
+    dst->biYPelsPerMeter = 0;
+    dst->biClrUsed = dst->biClrImportant = 0;
+    dst->biCompression = mmioFOURCC( cfg->fcc[0], cfg->fcc[1], cfg->fcc[2], cfg->fcc[3] );
+
+    return ICERR_OK;
+}
+
+/* compress_get_size: size in bytes reported for one compressed frame (codec and lpbiInput are unused) */
+LRESULT compress_get_size( CODEC *codec, BITMAPINFO *lpbiInput, BITMAPINFO *lpbiOutput )
+{   /* NOTE(review): a negative biHeight would make the result negative - confirm callers never pass one */
+    return 2 * lpbiOutput->bmiHeader.biWidth * lpbiOutput->bmiHeader.biHeight * 3;  /* 6 bytes per output pixel */
+}
+
+/* compress_frames_info: remember the rate/scale pair announced by the host */
+LRESULT compress_frames_info(CODEC * codec, ICCOMPRESSFRAMES * icf )
+{
+    codec->fbase = icf->dwRate;     /* dwRate / dwScale = frames per second */
+    codec->fincr = icf->dwScale;
+    return ICERR_OK;
+}
+
+/* compress_begin: (re)create the encoder for lpbiInput; ICERR_OK, or ICERR_ERROR if it cannot be opened */
+LRESULT compress_begin(CODEC * codec, BITMAPINFO * lpbiInput, BITMAPINFO * lpbiOutput )
+{
+    CONFIG *config = &codec->config;
+    x264_param_t param;
+
+    /* Destroy previous handle */
+    if( codec->h != NULL )
+    {
+        x264_encoder_close( codec->h );
+        codec->h = NULL;
+    }
+
+    /* Get default param */
+    x264_param_default( &param );
+
+    /* Set params: TODO to complete */
+    param.i_width = lpbiInput->bmiHeader.biWidth;
+    param.i_height= lpbiInput->bmiHeader.biHeight;
+    if( param.i_height < 0 )    /* negative height (accepted by get_csp): pass the magnitude */
+        param.i_height = -param.i_height;
+
+    if( codec->fbase > 0 && codec->fincr > 0 )  /* fbase/fincr = dwRate/dwScale = fps */
+        param.f_fps   = (float)codec->fbase / (float)codec->fincr;
+
+    param.i_frame_reference = config->i_refmax;
+    param.i_idrframe = config->i_idrframe;
+    param.i_iframe   = config->i_iframe;
+    param.i_qp_constant = config->i_qp;
+    param.b_deblocking_filter = config->b_filter;
+    param.b_cabac = config->b_cabac;
+
+    param.analyse.intra = 0;
+    param.analyse.inter = 0;
+    if( config->b_psub16x16 )
+        param.analyse.inter |= X264_ANALYSE_PSUB16x16;
+    if( config->b_psub8x8 )
+        param.analyse.inter |= X264_ANALYSE_PSUB8x8;
+    if( config->b_i4x4 )
+    {
+        param.analyse.intra |= X264_ANALYSE_I4x4;
+        param.analyse.inter |= X264_ANALYSE_I4x4;
+    }
+
+    switch( config->mode )
+    {
+        default: /* only 1 PASS (mode 0) for now: nothing to set up */
+            break;
+    }
+
+    /* Open the encoder */
+    codec->h = x264_encoder_open( &param );
+    if( codec->h == NULL )
+        return ICERR_ERROR;
+
+    return ICERR_OK;
+}
+
+/* compress_end: close the encoder if one is open; always ICERR_OK */
+LRESULT compress_end(CODEC * codec)
+{
+    if( codec->h == NULL )
+        return ICERR_OK;
+
+    x264_encoder_close( codec->h );
+    codec->h = NULL;
+
+    return ICERR_OK;
+}
+
+/* compress: encode the picture at icc->lpInput and write the resulting NAL units to icc->lpOutput */
+LRESULT compress( CODEC *codec, ICCOMPRESS *icc )
+{
+    BITMAPINFOHEADER *inhdr = icc->lpbiInput;
+    BITMAPINFOHEADER *outhdr = icc->lpbiOutput;
+
+    x264_picture_t pic;
+
+    int        i_nal;       /* number of entries in nal[] */
+    x264_nal_t *nal;
+    int        i_out;       /* bytes written to icc->lpOutput so far */
+
+    int i;
+
+    /* Init the picture */
+    memset( &pic, 0, sizeof( x264_picture_t ) );
+    pic.img.i_csp = get_csp( inhdr );
+
+    /* For now biWidth can be divided by 16 so no problem */
+    switch( pic.img.i_csp & X264_CSP_MASK )
+    {
+        case X264_CSP_I420:
+        case X264_CSP_YV12:
+            /* three planes laid out back to back: width*height, then two of width*height/4 */
+            pic.img.i_plane = 3;
+            pic.img.i_stride[0] = inhdr->biWidth;
+            pic.img.i_stride[1] =
+            pic.img.i_stride[2] = inhdr->biWidth / 2;
+
+            pic.img.plane[0]    = (uint8_t*)icc->lpInput;
+            pic.img.plane[1]    = pic.img.plane[0] + inhdr->biWidth * inhdr->biHeight;
+            pic.img.plane[2]    = pic.img.plane[1] + inhdr->biWidth * inhdr->biHeight / 4;
+            break;
+
+        case X264_CSP_YUYV:
+            pic.img.i_plane = 1;
+            pic.img.i_stride[0] = 2 * inhdr->biWidth;
+            pic.img.plane[0]    = (uint8_t*)icc->lpInput;
+            break;
+
+        case X264_CSP_BGR:
+            pic.img.i_plane = 1;
+            pic.img.i_stride[0] = 3 * inhdr->biWidth;
+            pic.img.plane[0]    = (uint8_t*)icc->lpInput;
+            break;
+
+        case X264_CSP_BGRA:
+            pic.img.i_plane = 1;
+            pic.img.i_stride[0] = 4 * inhdr->biWidth;
+            pic.img.plane[0]    = (uint8_t*)icc->lpInput;
+            break;
+
+        default:
+            return ICERR_BADFORMAT;
+    }
+
+    /* encode it (NOTE(review): the return value is not checked, and codec->h is used as is) */
+    x264_encoder_encode( codec->h, &nal, &i_nal, &pic );
+
+    /* create bitstream */
+    i_out = 0;
+    for( i = 0; i < i_nal; i++ )
+    {
+        int i_size = outhdr->biSizeImage - i_out;   /* in: room left; presumably out: bytes produced - confirm */
+        x264_nal_encode( (uint8_t*)icc->lpOutput + i_out, &i_size, 1, &nal[i] );
+
+        i_out += i_size;
+    }
+    outhdr->biSizeImage = i_out;    /* report the size actually used */
+
+    /* Set key frame only for IDR, as they are real synch point, I frame
+       aren't always synch point (ex: with multi refs, ref marking) */
+    if( pic.i_type == X264_TYPE_IDR )   /* i_type was zeroed above; presumably filled in by the encoder - confirm */
+        *icc->lpdwFlags = AVIIF_KEYFRAME;
+    else
+        *icc->lpdwFlags = 0;
+
+    return ICERR_OK;
+}
+
diff --git a/vfw/config.c b/vfw/config.c
new file mode 100644
index 00000000..79f5a7ba
--- /dev/null
+++ b/vfw/config.c
@@ -0,0 +1,443 @@
+/*****************************************************************************
+ * config.c: vfw x264 encoder
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: config.c,v 1.1 2004/06/03 19:27:09 fenrir Exp $
+ *
+ * Authors: Justin Clay
+ *          Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+/**************************************************************************
+ *
+ *  History:
+ *
+ *  2004.05.14  CBR encode mode support
+ *
+ **************************************************************************/
+
+#include "x264vfw.h"
+#include <stdio.h>  /* sprintf */
+#include <commctrl.h>
+
+/* Registry: the settings live under X264_REG_KEY\X264_REG_PARENT\X264_REG_CHILD */
+#define X264_REG_KEY	HKEY_CURRENT_USER
+#define X264_REG_PARENT "Software\\GNU\\x264"
+#define X264_REG_CHILD  "x264"
+#define X264_REG_CLASS  "config"
+
+/* window controls: upper ends of the bitrate and quantizer sliders (both start at 0) */
+#define BITRATE_MAX		5000
+#define QUANT_MAX		51
+
+/* description: caption and question of the "load defaults" message box */
+#define X264_NAME		"x264"
+#define X264_DEF_TEXT	"Are you sure you want to load default values"
+
+/* Registry handling: one table entry per stored setting */
+typedef struct
+{
+    char *reg_value;    /* registry value name */
+    int  *config_int;   /* where the setting is stored */
+    int  default_int;   /* used when the value cannot be read */
+} reg_int_t;
+
+typedef struct
+{
+    char *reg_value;    /* registry value name */
+    char *config_str;   /* where the setting is stored */
+    char *default_str;  /* used when the value cannot be read */
+} reg_str_t;
+
+CONFIG reg;     /* staging copy: the tables below point into it, load/save copy it from/to the caller */
+static const reg_int_t reg_int_table[] =
+{
+    /* Main dialog */
+    { "bitrate",        &reg.bitrate,           800 },
+    { "quantizer",      &reg.i_qp,              26 },
+    { "encoding_type",  &reg.i_encoding_type,   1 },
+
+    /* Advanced dialog */
+    { "cabac",          &reg.b_cabac,           1 },
+    { "loop_filter",    &reg.b_filter,          1 },
+    { "idrframe",       &reg.i_idrframe,        1 },
+    { "iframe",         &reg.i_iframe,          150 },
+    { "refmax",         &reg.i_refmax,          1 },
+
+    /* analysis */
+    {"i4x4",            &reg.b_i4x4,            1 },
+    {"psub16x16",       &reg.b_psub16x16,       1 },
+    {"psub8x8",         &reg.b_psub8x8,         1 }
+};
+
+static const reg_str_t reg_str_table[] =
+{
+    { "fourcc",         reg.fcc,                "x264" }    /* load/save move exactly 5 bytes for it */
+};
+
+void config_reg_load( CONFIG *config )
+{
+    HKEY    hKey;
+    DWORD   i_size;
+    int     i;
+    /* Fill *config from the registry; every value that cannot be read gets its table default */
+    if( RegOpenKeyEx( X264_REG_KEY, X264_REG_PARENT "\\" X264_REG_CHILD,
+                      0, KEY_READ, &hKey ) != ERROR_SUCCESS )
+        hKey = NULL;    /* no key (first use, or just reset): never query/close a garbage handle */
+
+    /* Read all integers */
+    for( i = 0; i < sizeof( reg_int_table )/sizeof( reg_int_t); i++ )
+    {
+        i_size = sizeof( int );
+        if( hKey == NULL || RegQueryValueEx( hKey, reg_int_table[i].reg_value, 0, 0,
+                                             (LPBYTE)reg_int_table[i].config_int,
+                                             &i_size ) != ERROR_SUCCESS )
+            *reg_int_table[i].config_int = reg_int_table[i].default_int;
+    }
+
+    /* Read strings */
+    for( i = 0; i < sizeof( reg_str_table )/sizeof( reg_str_t); i++ )
+    {
+        i_size = 5;   /* fourcc + 1 FIXME ugly */
+        if( hKey == NULL || RegQueryValueEx( hKey, reg_str_table[i].reg_value, 0, 0,
+                                             (LPBYTE)reg_str_table[i].config_str,
+                                             &i_size ) != ERROR_SUCCESS )
+            memcpy( reg_str_table[i].config_str, reg_str_table[i].default_str, 5 );
+    }
+
+    if( hKey != NULL )
+        RegCloseKey( hKey );
+    memcpy( config, &reg, sizeof( CONFIG ) );   /* the tables filled reg: hand a copy to the caller */
+}
+
+void config_reg_save( CONFIG *config )
+{
+    HKEY    hKey;
+    DWORD   i_size;     /* only receives the disposition from RegCreateKeyEx */
+    int     i;
+    /* Write *config to the registry; silently does nothing if the key cannot be created/opened */
+    if( RegCreateKeyEx( X264_REG_KEY,
+                        X264_REG_PARENT "\\" X264_REG_CHILD,
+                        0,
+                        X264_REG_CLASS,
+                        REG_OPTION_NON_VOLATILE,
+                        KEY_WRITE,
+                        0, &hKey, &i_size ) != ERROR_SUCCESS )
+        return;
+
+    memcpy( &reg, config, sizeof( CONFIG ) );   /* the tables point into reg, so stage the values there */
+
+    /* Save all integers */
+    for( i = 0; i < sizeof( reg_int_table )/sizeof( reg_int_t); i++ )
+    {
+        RegSetValueEx( hKey, reg_int_table[i].reg_value, 0, REG_DWORD,
+                       (LPBYTE)reg_int_table[i].config_int, sizeof( int ) );
+    }
+
+    /* Save strings */
+    for( i = 0; i < sizeof( reg_str_table )/sizeof( reg_str_t); i++ )
+    {
+        RegSetValueEx( hKey, reg_str_table[i].reg_value, 0, REG_SZ,
+                       (LPBYTE)reg_str_table[i].config_str,
+                        5 );    /* FIXME */
+    }
+
+    RegCloseKey( hKey );    /* NOTE(review): none of the RegSetValueEx results are checked */
+}
+
+/* config_reg_defaults: drop the saved settings, then reload (now the defaults) and save them */
+void config_reg_defaults( CONFIG *config )
+{
+    HKEY hKey;
+
+    /* Just in case */
+    memset( config, 0, sizeof( CONFIG ) );
+
+    if( RegOpenKeyEx( X264_REG_KEY, X264_REG_PARENT, 0, KEY_ALL_ACCESS, &hKey ) == ERROR_SUCCESS )
+    {
+        RegDeleteKey( hKey, X264_REG_CHILD );   /* best effort: nothing to delete if never saved */
+        RegCloseKey( hKey );
+    }
+
+    config_reg_load( config );  /* must always run, or the caller is left with the zeroed config */
+    config_reg_save( config );
+}
+
+/* main_enable_item:
+ *  enable the controls of the selected encoding type, disable the others,
+ *  then (re)apply the range of both sliders */
+static void main_enable_item( HWND hDlg, CONFIG * config )
+{
+    const int i_type = config->i_encoding_type;
+
+    /* 0 : 1 Pass, Bitrate Based
+     * 1 : 1 Pass, Quantizer Based
+     * 2 : 2 Pass, not yet implemented: the controls are left untouched */
+    if( i_type == 0 || i_type == 1 )
+    {
+        /* exactly one of the two groups is usable */
+        const BOOL b_bitrate = ( i_type == 0 );
+        const BOOL b_quant   = !b_bitrate;
+
+        /* bitrate edit box and slider */
+        EnableWindow( GetDlgItem( hDlg, IDC_BITRATEEDIT ), b_bitrate );
+        EnableWindow( GetDlgItem( hDlg, IDC_BITRATESLIDER ), b_bitrate );
+
+        /* quantizer edit box and slider */
+        EnableWindow( GetDlgItem( hDlg, IDC_QUANTEDIT ), b_quant );
+        EnableWindow( GetDlgItem( hDlg, IDC_QUANTSLIDER ), b_quant );
+    }
+
+    /* Both sliders go from 0 (low word) to their *_MAX (high word) */
+    SendDlgItemMessage( hDlg, IDC_BITRATESLIDER, TBM_SETRANGE, TRUE,
+                        (LPARAM) MAKELONG( 0, BITRATE_MAX ) );
+
+    SendDlgItemMessage( hDlg, IDC_QUANTSLIDER, TBM_SETRANGE, TRUE,
+                        (LPARAM) MAKELONG( 0, QUANT_MAX ) );
+}
+
+/* main_update_dlg: copy bitrate, quantizer and encoding type from config into the main dialog controls */
+static void main_update_dlg( HWND hDlg, CONFIG * config )
+{
+    SetDlgItemInt( hDlg, IDC_BITRATEEDIT, config->bitrate, FALSE );
+    SetDlgItemInt( hDlg, IDC_QUANTEDIT, config->i_qp, FALSE );
+
+    switch( config->i_encoding_type )   /* any other value leaves the radio buttons as they are */
+    {
+    case 0 : /* 1 Pass, Bitrate Based */
+        CheckRadioButton( hDlg,
+                          IDC_RADIOBITRATE, IDC_RADIOTWOPASS, IDC_RADIOBITRATE);
+        break;
+    case 1 : /* 1 Pass, Quantizer Based */
+        CheckRadioButton(hDlg,
+                         IDC_RADIOBITRATE, IDC_RADIOTWOPASS, IDC_RADIOQUANT);
+            break;
+    case 2 : /* 2 Pass */
+        CheckRadioButton(hDlg,
+                         IDC_RADIOBITRATE, IDC_RADIOTWOPASS, IDC_RADIOTWOPASS);
+        break;
+    }
+
+    SendDlgItemMessage( hDlg, IDC_BITRATESLIDER, TBM_SETPOS, TRUE,
+                        config->bitrate );  /* keep the sliders in step with the edit boxes */
+    SendDlgItemMessage( hDlg, IDC_QUANTSLIDER, TBM_SETPOS, TRUE,
+                        config->i_qp );
+}
+
+
+/* Main config dialog: edits the CONFIG passed as lParam of WM_INITDIALOG; returns 1 for handled messages, 0 otherwise */
+BOOL CALLBACK callback_main( HWND hDlg, UINT uMsg, WPARAM wParam, LPARAM lParam )
+{
+    CONFIG* config = (CONFIG*)GetWindowLong(hDlg, GWL_USERDATA);   /* stored by WM_INITDIALOG below */
+
+    switch( uMsg )
+    {
+    case WM_INITDIALOG :
+        SetWindowLong( hDlg, GWL_USERDATA, lParam );    /* remember the CONFIG for later messages */
+        config = (CONFIG*)lParam;
+
+        main_enable_item( hDlg, config );
+        main_update_dlg( hDlg, config );
+
+        break;
+
+    case WM_COMMAND:
+        switch ( HIWORD( wParam ) )     /* notification code; LOWORD is the control id */
+        {
+        case BN_CLICKED :
+            switch( LOWORD( wParam ) )
+            {
+            case IDOK :
+                config->b_save = TRUE;
+                EndDialog( hDlg, LOWORD(wParam) );
+                break;
+            case IDCANCEL :
+                config->b_save = FALSE;
+                EndDialog( hDlg, LOWORD(wParam) );
+                break;
+            case IDC_ADVANCED :
+                /* the advanced dialog edits the same CONFIG */
+                DialogBoxParam( g_hInst, MAKEINTRESOURCE(IDD_ADVANCED),
+                                (HWND)lParam, callback_advanced,
+                                (LPARAM)config );
+                break;
+            case IDC_DEFAULTS :
+                if( MessageBox( hDlg, X264_DEF_TEXT, X264_NAME, MB_YESNO ) == IDYES )
+                {
+                    config_reg_defaults( config );
+                    main_enable_item( hDlg, config );
+                    main_update_dlg( hDlg, config );
+                }
+                break;
+            case IDC_RADIOBITRATE :
+                config->i_encoding_type = 0; /* 1 Pass, Bitrate Mode=0 */
+                main_enable_item( hDlg, config );
+                main_update_dlg( hDlg, config );
+                break;
+            case IDC_RADIOQUANT :
+                config->i_encoding_type = 1; /* 1 Pass, Quantizer Mode=1 */
+                main_enable_item( hDlg, config );
+                main_update_dlg( hDlg, config );
+                break;
+            case IDC_RADIOTWOPASS :
+                config->i_encoding_type = 2; /* 2 Pass Mode=2 */
+                main_enable_item( hDlg,  config );
+                main_update_dlg( hDlg, config );
+                break;
+            }
+            break;
+        case EN_CHANGE :
+            /* an edit box changed: store the value and move the matching slider */
+            switch( LOWORD( wParam ) )
+            {
+            case IDC_BITRATEEDIT :
+                config->bitrate = GetDlgItemInt( hDlg, IDC_BITRATEEDIT, FALSE, FALSE );
+                SendDlgItemMessage( hDlg, IDC_BITRATESLIDER, TBM_SETPOS, TRUE, config->bitrate );
+                break;
+            case IDC_QUANTEDIT :
+                config->i_qp = GetDlgItemInt( hDlg, IDC_QUANTEDIT, FALSE, FALSE );
+                SendDlgItemMessage( hDlg, IDC_QUANTSLIDER, TBM_SETPOS, TRUE, config->i_qp );
+                break;
+            }
+            break;
+        default:
+            break;
+        }
+        break;
+
+        case WM_HSCROLL :
+            /* a slider moved: store its position and refresh the matching edit box */
+            if( (HWND) lParam == GetDlgItem( hDlg, IDC_BITRATESLIDER ) )
+            {
+                config->bitrate = SendDlgItemMessage( hDlg, IDC_BITRATESLIDER, TBM_GETPOS, 0, 0 );
+                SetDlgItemInt( hDlg, IDC_BITRATEEDIT, config->bitrate, FALSE );
+
+            }
+            else if( (HWND) lParam == GetDlgItem( hDlg, IDC_QUANTSLIDER ) )
+            {
+                config->i_qp = SendDlgItemMessage( hDlg, IDC_QUANTSLIDER, TBM_GETPOS, 0, 0 );
+                SetDlgItemInt( hDlg, IDC_QUANTEDIT, config->i_qp, FALSE );
+            }
+            break;
+
+    default :
+        return 0;
+    }
+
+    return 1;
+}
+
+/* About dialog: shows the build string; returns 1 for handled messages, 0 otherwise */
+BOOL CALLBACK callback_about( HWND hDlg, UINT uMsg, WPARAM wParam, LPARAM lParam )
+{
+    switch( uMsg )
+    {
+    case WM_INITDIALOG :
+    {
+        char temp[1024];
+        sprintf( temp, "Core %d, build %s %s", X264_BUILD, __DATE__, __TIME__ );   /* compile date/time of this file */
+        SetDlgItemText( hDlg, IDC_BUILD,  temp );
+        break;
+    }
+
+    case WM_COMMAND:
+        if (LOWORD(wParam) == IDC_HOMEPAGE && HIWORD(wParam) == STN_CLICKED)
+            ShellExecute( hDlg, "open", X264_WEBSITE, NULL, NULL, SW_SHOWNORMAL );  /* open the homepage link */
+        else if (LOWORD(wParam) == IDOK || LOWORD(wParam) == IDCANCEL)
+            EndDialog( hDlg, LOWORD(wParam) );
+        break;
+
+    default :
+        return 0;
+    }
+
+    return 1;
+}
+
+static void adv_update_dlg( HWND hDlg, CONFIG * config )
+{
+    char fourcc[5];
+    /* Copy the advanced settings from config into the controls of the advanced dialog */
+    CheckDlgButton( hDlg,IDC_CABAC,
+                    config->b_cabac ? BST_CHECKED : BST_UNCHECKED );
+    CheckDlgButton( hDlg,IDC_LOOPFILTER,
+                    config->b_filter ? BST_CHECKED: BST_UNCHECKED );
+
+    SetDlgItemInt( hDlg, IDC_IDRFRAMES, config->i_idrframe, FALSE );
+    SetDlgItemInt( hDlg, IDC_IFRAMES, config->i_iframe, FALSE );
+    SetDlgItemInt( hDlg, IDC_KEYFRAME, config->i_refmax, FALSE );   /* IDC_KEYFRAME holds i_refmax */
+
+    memcpy( fourcc, config->fcc, 4 );   /* take 4 characters and terminate them ourselves */
+    fourcc[4] = '\0';
+
+    SetDlgItemText( hDlg, IDC_FOURCC, fourcc );
+}
+
+
+/* advanced configuration dialog: edits the CONFIG passed as lParam of WM_INITDIALOG; returns 1 if handled, else 0 */
+BOOL CALLBACK callback_advanced( HWND hDlg, UINT uMsg, WPARAM wParam, LPARAM lParam )
+{
+    CONFIG* config = (CONFIG*)GetWindowLong(hDlg, GWL_USERDATA);   /* stored by WM_INITDIALOG below */
+
+    switch( uMsg )
+    {
+    case WM_INITDIALOG :
+        SetWindowLong( hDlg, GWL_USERDATA, lParam );    /* remember the CONFIG for later messages */
+        config = (CONFIG*)lParam;
+
+        adv_update_dlg( hDlg, config );
+        break;
+
+    case WM_COMMAND:
+        switch ( HIWORD( wParam ) )     /* notification code; LOWORD is the control id */
+        {
+        case BN_CLICKED :
+            switch( LOWORD( wParam ) )  /* NOTE(review): IDCANCEL is not handled here */
+            {
+            case IDOK :
+                EndDialog( hDlg, LOWORD( wParam ) );
+                break;
+            case IDC_CABAC :
+                config->b_cabac = ( IsDlgButtonChecked( hDlg, IDC_CABAC ) == BST_CHECKED );
+                break;
+            case IDC_LOOPFILTER :
+                config->b_filter = ( IsDlgButtonChecked( hDlg, IDC_LOOPFILTER ) == BST_CHECKED );
+                break;
+            }
+            break;
+        case EN_CHANGE :
+            switch( LOWORD( wParam ) )  /* every edit is written to config immediately */
+            {
+            case IDC_IDRFRAMES :
+                config->i_idrframe = GetDlgItemInt( hDlg, IDC_IDRFRAMES, FALSE, FALSE );
+                break;
+            case IDC_IFRAMES :
+                config->i_iframe = GetDlgItemInt( hDlg, IDC_IFRAMES, FALSE, FALSE );
+                break;
+            case IDC_KEYFRAME :
+                config->i_refmax = GetDlgItemInt( hDlg, IDC_KEYFRAME, FALSE, FALSE );
+                break;
+            case IDC_FOURCC :
+                GetDlgItemText( hDlg, IDC_FOURCC, config->fcc, 5 );    /* assumes fcc has room for 5 bytes - confirm */
+                break;
+            }
+            break;
+        }
+        break;
+    default :
+        return 0;
+    }
+    return 1;
+}
+
diff --git a/vfw/driverproc.c b/vfw/driverproc.c
new file mode 100644
index 00000000..4a050e68
--- /dev/null
+++ b/vfw/driverproc.c
@@ -0,0 +1,234 @@
+/*****************************************************************************
+ * driverproc.c: vfw x264 wrapper
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: driverproc.c,v 1.1 2004/06/03 19:27:09 fenrir Exp $
+ *
+ * Authors: Justin Clay
+ *          Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#include "x264vfw.h"
+
+/* Global dll instance */
+HINSTANCE g_hInst;
+
+
+/* DLL entry point: stores the module handle in g_hInst, which DriverProc passes to DialogBoxParam */
+BOOL WINAPI DllMain( HANDLE hModule, DWORD ul_reason_for_call, LPVOID lpReserved )
+{
+    g_hInst = (HINSTANCE) hModule;  /* done on every call: ul_reason_for_call is not inspected */
+    return TRUE;
+}
+
+/* VfW driver entry point: dispatches the DRV_* and ICM_* messages sent by the host.
+ * dwDriverId is the CODEC pointer returned by DRV_OPEN; lParam1/lParam2 are message
+ * specific.  Buffer sizes passed with ICM_GETINFO/ICM_GETSTATE/ICM_SETSTATE are
+ * checked before a fixed-size structure is copied in or out. */
+LRESULT WINAPI DriverProc( DWORD dwDriverId, HDRVR hDriver, UINT uMsg, LPARAM lParam1, LPARAM lParam2 )
+{
+    CODEC *codec = (CODEC *)dwDriverId;
+
+    switch( uMsg )
+    {
+        case DRV_LOAD:
+        case DRV_FREE:
+            return DRV_OK;
+
+        case DRV_OPEN:
+        {
+            ICOPEN *icopen = (ICOPEN *)lParam2;
+
+            if( icopen != NULL && icopen->fccType != ICTYPE_VIDEO )
+                return DRV_CANCEL;
+
+            if( ( codec = malloc( sizeof( CODEC ) ) ) == NULL )
+            {
+                if( icopen != NULL )
+                    icopen->dwError = ICERR_MEMORY;
+                return 0;
+            }
+
+            memset( codec, 0, sizeof( CODEC ) );
+            config_reg_load( &codec->config );
+            codec->h = NULL;
+
+            if( icopen != NULL )
+                icopen->dwError = ICERR_OK;
+            return (LRESULT)codec;
+        }
+
+        case DRV_CLOSE:
+            /* From xvid: compress_end/decompress_end don't always get called */
+            compress_end(codec);
+            free( codec );
+            return DRV_OK;
+
+        case DRV_DISABLE:
+        case DRV_ENABLE:
+        case DRV_INSTALL:
+        case DRV_REMOVE:
+            return DRV_OK;
+
+        case DRV_QUERYCONFIGURE:
+        case DRV_CONFIGURE:
+            return DRV_CANCEL;
+
+        /* info */
+        case ICM_GETINFO:
+        {
+            ICINFO *icinfo = (ICINFO *)lParam1;
+
+            /* the members below are written unconditionally: refuse a missing or short buffer */
+            if( icinfo == NULL || (DWORD)lParam2 < sizeof( ICINFO ) )
+                return 0;
+
+            /* return a description */
+            icinfo->fccType      = ICTYPE_VIDEO;
+            icinfo->fccHandler   = FOURCC_X264;
+            icinfo->dwFlags      = VIDCF_COMPRESSFRAMES | VIDCF_FASTTEMPORALC;
+            icinfo->dwVersion    = 0;
+            icinfo->dwVersionICM = ICVERSION;
+            wcscpy( icinfo->szName, X264_NAME_L);
+            wcscpy( icinfo->szDescription, X264_DESC_L);
+
+            return sizeof( ICINFO ); /* size of struct */
+        }
+
+        case ICM_ABOUT:
+            if( lParam1 != -1 )
+                DialogBoxParam(g_hInst, MAKEINTRESOURCE(IDD_ABOUT), (HWND)lParam1, callback_about, 0 );
+            return ICERR_OK;
+
+        case ICM_CONFIGURE:
+            if( lParam1 != -1 )
+            {
+                CONFIG temp;
+
+                /* the dialog edits a copy, adopted and saved only if it set b_save */
+                codec->config.b_save = FALSE;
+                memcpy( &temp, &codec->config, sizeof(CONFIG) );
+                DialogBoxParam( g_hInst, MAKEINTRESOURCE(IDD_MAINCONFIG), (HWND)lParam1, callback_main, (LPARAM)&temp );
+                if( temp.b_save )
+                {
+                    memcpy( &codec->config, &temp, sizeof(CONFIG) );
+                    config_reg_save( &codec->config );
+                }
+            }
+            return ICERR_OK;
+
+        case ICM_GETSTATE:
+            if( (void*)lParam1 == NULL )
+                return sizeof( CONFIG );    /* size query */
+            if( (DWORD)lParam2 < sizeof( CONFIG ) )
+                return ICERR_BADSIZE;       /* host buffer cannot hold the state */
+            memcpy( (void*)lParam1, &codec->config, sizeof( CONFIG ) );
+            return ICERR_OK;
+
+        case ICM_SETSTATE:
+            if( (void*)lParam1 == NULL )
+            {
+                config_reg_load( &codec->config );
+                return 0;
+            }
+            if( (DWORD)lParam2 < sizeof( CONFIG ) )
+                return 0;   /* blob shorter than CONFIG: keep the current settings */
+            memcpy( &codec->config, (void*)lParam1, sizeof( CONFIG ) );
+            return 0;
+
+        /* not sure the difference, private/public data? */
+        case ICM_GET:
+        case ICM_SET:
+            return ICERR_OK;
+
+        /* older-style config */
+        case ICM_GETDEFAULTQUALITY:
+        case ICM_GETQUALITY:
+        case ICM_SETQUALITY:
+        case ICM_GETBUFFERSWANTED:
+        case ICM_GETDEFAULTKEYFRAMERATE:
+            return ICERR_UNSUPPORTED;
+
+        /* compressor */
+        case ICM_COMPRESS_QUERY:
+            return compress_query(codec, (BITMAPINFO *)lParam1, (BITMAPINFO *)lParam2);
+
+        case ICM_COMPRESS_GET_FORMAT:
+            return compress_get_format(codec, (BITMAPINFO *)lParam1, (BITMAPINFO *)lParam2);
+
+        case ICM_COMPRESS_GET_SIZE:
+            return compress_get_size(codec, (BITMAPINFO *)lParam1, (BITMAPINFO *)lParam2);
+
+        case ICM_COMPRESS_FRAMES_INFO:
+            return compress_frames_info(codec, (ICCOMPRESSFRAMES *)lParam1);
+
+        case ICM_COMPRESS_BEGIN:
+            return compress_begin(codec, (BITMAPINFO *)lParam1, (BITMAPINFO *)lParam2);
+
+        case ICM_COMPRESS_END:
+            return compress_end(codec);
+
+        case ICM_COMPRESS:
+            return compress(codec, (ICCOMPRESS *)lParam1);
+
+        /* decompressor : not implemented */
+        case ICM_DECOMPRESS_QUERY:
+        case ICM_DECOMPRESS_GET_FORMAT:
+        case ICM_DECOMPRESS_BEGIN:
+        case ICM_DECOMPRESS_END:
+        case ICM_DECOMPRESS:
+        case ICM_DECOMPRESS_GET_PALETTE:
+        case ICM_DECOMPRESS_SET_PALETTE:
+        case ICM_DECOMPRESSEX_QUERY:
+        case ICM_DECOMPRESSEX_BEGIN:
+        case ICM_DECOMPRESSEX_END:
+        case ICM_DECOMPRESSEX:
+            return ICERR_UNSUPPORTED;
+
+#if 0
+        /* VFWEXT entry point : XXX what's that ? */
+        case ICM_USER+0x0fff :
+            if (lParam1 == VFWEXT_CONFIGURE_INFO) {
+                VFWEXT_CONFIGURE_INFO_T * info = (VFWEXT_CONFIGURE_INFO_T*)lParam2;
+                DPRINTF("%i %i %i %i %i %i",
+                    info->ciWidth, info->ciHeight,
+                    info->ciRate, info->ciScale,
+                    info->ciActiveFrame, info->ciFrameCount);
+
+                codec->config.ci_valid = 1;
+                memcpy(&codec->config.ci, (void*)lParam2, sizeof(VFWEXT_CONFIGURE_INFO_T));
+                return ICERR_OK;
+            }
+            return ICERR_UNSUPPORTED;
+#endif
+
+        default:
+            return DefDriverProc( dwDriverId, hDriver, uMsg, lParam1, lParam2 );
+    }
+}
+
+void WINAPI Configure(HWND hwnd, HINSTANCE hinst, LPTSTR lpCmdLine, int nCmdShow)
+{   /* Opens a throw-away codec instance, shows its configuration dialog, closes it. */
+    DWORD h_codec = DriverProc(0, 0, DRV_OPEN, 0, 0);
+
+    if (h_codec == (DWORD)NULL)
+        return;                 /* DRV_OPEN could not allocate the instance */
+
+    /* the desktop window is the dialog parent; the four arguments are not used */
+    DriverProc(h_codec, 0, ICM_CONFIGURE, (LPARAM)GetDesktopWindow(), 0);
+    DriverProc(h_codec, 0, DRV_CLOSE, 0, 0);
+}
diff --git a/vfw/resource.h b/vfw/resource.h
new file mode 100644
index 00000000..df693805
--- /dev/null
+++ b/vfw/resource.h
@@ -0,0 +1,52 @@
+//{{NO_DEPENDENCIES}}
+// Microsoft Developer Studio generated include file.
+// Used by resource.rc
+//
+#define IDD_DIALOG1                     101
+#define IDD_MAINCONFIG                  101
+#define IDD_ADVANCED                    102
+#define IDD_ABOUT                       103
+#define IDC_BITRATESLIDER               1002
+#define IDC_BITRATEEDIT                 1003
+#define IDC_BITRATESLIDER2              1004
+#define IDC_QUANTSLIDER                 1004
+#define IDC_CABAC                       1005
+#define IDC_BITRATEEDIT2                1005
+#define IDC_QUANTEDIT                   1005
+#define IDC_LOOPFILTER                  1007
+#define IDC_BITRATELOW                  1009
+#define IDC_BITRATELOW2                 1010
+#define IDC_BITRATEHIGH                 1011
+#define IDC_BFRAMES                     1012
+#define IDC_BITRATEHIGH2                1012
+#define IDC_IDRFRAMES                   1012
+#define IDC_BFRAMES2                    1013
+#define IDC_IFRAMES                     1013
+#define IDC_EDIT3                       1014
+#define IDC_KEYFRAME                    1014
+#define IDC_DEFAULTS                    1016
+#define IDC_CHECK3                      1017
+#define IDC_ADVANCED                    1018
+#define IDC_RADIO1                      1022
+#define IDC_RADIOBITRATE                1022
+#define IDC_MODE                        1023
+#define IDC_RADIOQUALITY                1024
+#define IDC_RADIOQUANT                  1024
+#define IDC_RADIOTWOPASS                1026
+#define IDC_USEADVANCED                 1029
+#define IDC_ADVDEFAULTS                 1030
+#define IDC_HOMEPAGE                    1034
+#define IDC_X264                        1035
+#define IDC_BUILD                       1036
+#define IDC_FOURCC                      1039
+
+// Next default values for new objects
+// 
+#ifdef APSTUDIO_INVOKED
+#ifndef APSTUDIO_READONLY_SYMBOLS
+#define _APS_NEXT_RESOURCE_VALUE        104
+#define _APS_NEXT_COMMAND_VALUE         40001
+#define _APS_NEXT_CONTROL_VALUE         1040
+#define _APS_NEXT_SYMED_VALUE           101
+#endif
+#endif
diff --git a/vfw/x264vfw.h b/vfw/x264vfw.h
new file mode 100644
index 00000000..36e36cf3
--- /dev/null
+++ b/vfw/x264vfw.h
@@ -0,0 +1,103 @@
+#ifndef _X264_VFW_H
+#define _X264_VFW_H
+
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+
+#include <windows.h>
+#include <vfw.h>
+
+#include <x264.h>
+
+#include "resource.h"
+
+/* Name */
+#define X264_NAME_L     L"x264"
+#define X264_DESC_L     L"x264 - H264/AVC encoder"
+
+/* Codec fcc */
+#define FOURCC_X264 mmioFOURCC('X','2','6','4')
+
+/* yuv 4:2:0 planar */
+#define FOURCC_I420 mmioFOURCC('I','4','2','0')
+#define FOURCC_IYUV mmioFOURCC('I','Y','U','V')
+#define FOURCC_YV12 mmioFOURCC('Y','V','1','2')
+
+/* yuv 4:2:2 packed */
+#define FOURCC_YUY2 mmioFOURCC('Y','U','Y','2')
+#define FOURCC_YUYV mmioFOURCC('Y','U','Y','V')
+
+#define X264_WEBSITE	"http://lyra.via.ecp.fr/"
+
+/* CONFIG: vfw config.  DriverProc copies this struct byte-for-byte for ICM_GETSTATE and
+ * ICM_SETSTATE, so any layout change also changes the state blob exchanged with hosts. */
+typedef struct
+{
+    /********** ATTENTION **********/
+    int mode;                   /* Vidomi directly accesses these vars */
+    int bitrate;
+    int desired_size;           /* please try to avoid modifications here */
+    char stats[MAX_PATH];
+    /*******************************/
+
+    /* Our config */
+    int i_refmax;               /* read from the IDC_KEYFRAME edit box (advanced dialog) */
+    int i_idrframe;             /* read from the IDC_IDRFRAMES edit box */
+    int i_iframe;               /* read from the IDC_IFRAMES edit box */
+
+    int i_qp;
+    int b_filter;               /* state of the IDC_LOOPFILTER check box */
+
+    int b_cabac;                /* state of the IDC_CABAC check box */
+
+    int b_i4x4;
+    int b_psub16x16;
+    int b_psub8x8;
+
+    /* vfw interface */
+    int b_save;                 /* cleared before the main dialog runs; if set afterwards the edited copy is kept and saved */
+    /* fourcc used */
+    char fcc[4+1];              /* four characters plus terminating NUL (IDC_FOURCC edit box) */
+    int  i_encoding_type;
+} CONFIG;
+
+/* CODEC: vfw codec instance, allocated zero-filled on DRV_OPEN and freed on DRV_CLOSE
+ */
+typedef struct
+{
+    CONFIG config;      /* filled by config_reg_load() when the instance is opened */
+
+    /* handle */
+    x264_t *h;          /* NULL right after DRV_OPEN */
+
+    /* XXX: needed ? */
+    unsigned int fincr;
+    unsigned int fbase;
+} CODEC;
+
+/* Compress functions */
+LRESULT compress_query(CODEC *, BITMAPINFO *, BITMAPINFO *);
+LRESULT compress_get_format(CODEC *, BITMAPINFO *, BITMAPINFO *);
+LRESULT compress_get_size(CODEC *, BITMAPINFO *, BITMAPINFO *);
+LRESULT compress_frames_info(CODEC *, ICCOMPRESSFRAMES *);
+LRESULT compress_begin(CODEC *, BITMAPINFO *, BITMAPINFO *);
+LRESULT compress_end(CODEC *);
+LRESULT compress(CODEC *, ICCOMPRESS *);
+
+
+/* config functions */
+void config_reg_load( CONFIG * config );
+void config_reg_save( CONFIG * config );
+
+
+/* Dialog callbacks */
+BOOL CALLBACK callback_about( HWND hDlg, UINT uMsg, WPARAM wParam, LPARAM lParam );
+BOOL CALLBACK callback_main ( HWND hDlg, UINT uMsg, WPARAM wParam, LPARAM lParam );
+BOOL CALLBACK callback_advanced( HWND hDlg, UINT uMsg, WPARAM wParam, LPARAM lParam );
+
+/* Dll instance */
+extern HINSTANCE g_hInst;
+
+#endif
+
diff --git a/x264.c b/x264.c
new file mode 100644
index 00000000..75c347b6
--- /dev/null
+++ b/x264.c
@@ -0,0 +1,558 @@
+/*****************************************************************************
+ * x264: h264 encoder/decoder testing program.
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: x264.c,v 1.1 2004/06/03 19:24:12 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#define _GNU_SOURCE /* feature-test macro: only effective before the first system header */
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdint.h>
+
+#include <math.h>
+
+#include <signal.h>
+#include <getopt.h>
+
+#ifdef _MSC_VER
+#include <io.h>     /* _setmode() */
+#include <fcntl.h>  /* _O_BINARY */
+#endif
+
+#include "x264.h"
+#include "core/common.h"
+
+#define DATA_MAX 3000000
+uint8_t data[DATA_MAX];
+
+/* Ctrl-C handler: only raises a flag, which the encode loop tests before each frame */
+static volatile sig_atomic_t i_ctrl_c = 0;  /* the type a signal handler may portably write */
+static void    SigIntHandler( int a )
+{
+    i_ctrl_c = 1;
+}
+
+static void Help( void );
+static int  Parse( int argc, char **argv, x264_param_t  *param, FILE **p_fin, FILE **p_fout, int *pb_decompress );
+static int  Encode( x264_param_t  *param, FILE *fyuv,  FILE *fout );
+static int  Decode( x264_param_t  *param, FILE *fh26l, FILE *fout );
+
+/****************************************************************************
+ * main: parse the command line, then run the decoder (-x) or the encoder.
+ * Returns -1 when Parse() fails, otherwise the result of Decode()/Encode().
+ ****************************************************************************/
+int main( int argc, char **argv )
+{
+    x264_param_t settings;
+    FILE        *in_stream;
+    FILE        *out_stream;
+    int          want_decode;
+
+#ifdef _MSC_VER
+    /* thanks to Marcos Morais <morais at dee.ufcg.edu.br> */
+    _setmode(_fileno(stdin), _O_BINARY);
+    _setmode(_fileno(stdout), _O_BINARY);
+#endif
+
+    /* library defaults, with the frame rate set to 25 fps */
+    x264_param_default( &settings );
+    settings.f_fps = 25.0;
+
+    /* Parse() also selects the input/output streams and the mode */
+    if( Parse( argc, argv, &settings, &in_stream, &out_stream, &want_decode ) < 0 )
+    {
+        return -1;
+    }
+
+    /* from here on Ctrl-C runs SigIntHandler, which raises i_ctrl_c */
+    signal( SIGINT, SigIntHandler );
+
+    /* the status of the selected worker becomes the exit code */
+    if( want_decode )
+    {
+        return Decode( &settings, in_stream, out_stream );
+    }
+    return Encode( &settings, in_stream, out_stream );
+}
+
+/*****************************************************************************
+ * Help: print the build number and the command line summary on stderr.
+ *****************************************************************************/
+static void Help( void )
+{
+    fprintf( stderr,
+             "x264 build:0x%4.4x\n"
+             "Syntax: x264 [options] [-o out.h26l] in.yuv widthxheight\n"
+             "\n"
+             "  -h, --help                  Print this help\n"
+             "\n"
+             "  -I, --idrframe <integer>    Each 'number' I frames are IDR frames\n"
+             "  -i, --iframe <integer>      Frequency of I frames\n"
+             "  -b, --bframe <integer>      Number of B-frames between I and P\n"
+             "\n"
+             "  -c, --cabac                 Enable CABAC\n"
+             "  -r, --ref <integer>         Number of references\n"
+             "  -n, --nf                    Disable loop filter\n"
+             "  -f, --filter <alpha:beta>   Loop filter AlphaC0 and Beta parameters\n"
+             "  -q, --qp <integer>          Set QP\n"
+             "  -B, --bitrate <integer>     Set bitrate [broken]\n"
+             "\n"
+             "  -A, --analyse <string>      Analyse options:\n"
+             "                                  - i4x4\n"
+             "                                  - psub16x16,psub8x8\n"
+             "                                  - none, all\n"
+             "\n"
+             "  -s, --sar width:height      Specify Sample Aspect Ratio\n"
+             "  -o, --output                Specify output file\n"
+             "\n"
+             "      --no-asm                Disable any CPU optims\n"
+             "\n",
+            X264_BUILD
+           );
+}
+
+/* Return the first "<digits>x<digit>" group found in psz, or NULL if there is none */
+static char *FindSizeInName( char *psz )
+{
+    while( *psz )
+    {
+        char *psz_start;
+        while( *psz && ( *psz < '0' || *psz > '9' ) ) psz++;
+        for( psz_start = psz; *psz >= '0' && *psz <= '9'; ) psz++;
+        if( psz > psz_start && psz[0] == 'x' && psz[1] >= '0' && psz[1] <= '9' )
+            return psz_start;
+    }
+    return NULL;
+}
+
+/*****************************************************************************
+ * Parse: fill *param from the options and open *p_fin / *p_fout (stdin and
+ * stdout by default); *pb_decompress is set by -x.  When encoding, the size
+ * is the argument after the input name, else it is searched in that name.
+ * Returns 0 on success, -1 after printing the help or an error message.
+ *****************************************************************************/
+static int  Parse( int argc, char **argv,
+                   x264_param_t  *param,
+                   FILE **p_fin, FILE **p_fout, int *pb_decompress )
+{
+    char *psz_filename = NULL;
+
+    /* Default output */
+    *p_fout = stdout;
+    *p_fin  = stdin;
+    *pb_decompress = 0;
+
+    /* Parse command line options */
+    opterr = 0; // no error message
+    for( ;; )
+    {
+        int long_options_index;
+        static struct option long_options[] =
+        {
+            { "help",    no_argument,       NULL, 'h' },
+            { "bitrate", required_argument, NULL, 'B' },
+            { "bframe",  required_argument, NULL, 'b' },
+            { "iframe",  required_argument, NULL, 'i' },
+            { "idrframe",required_argument, NULL, 'I' },
+            { "nf",      no_argument,       NULL, 'n' },
+            { "filter",  required_argument, NULL, 'f' },
+            { "cabac",   no_argument,       NULL, 'c' },
+            { "qp",      required_argument, NULL, 'q' },
+            { "ref",     required_argument, NULL, 'r' },
+            { "no-asm",  no_argument,       NULL, 'C' },
+            { "sar",     required_argument, NULL, 's' },
+            { "output",  required_argument, NULL, 'o' },
+            { "analyse", required_argument, NULL, 'A' },
+            {0, 0, 0, 0}
+        };
+        int c = getopt_long( argc, argv, "hi:I:b:r:cxB:q:nf:o:s:A:",
+                             long_options, &long_options_index);
+
+        if( c == -1 )
+            break;
+
+        switch( c )
+        {
+            case 'h':
+                Help();
+                return -1;
+
+            case 0:
+                break;
+            case 'B':
+                param->i_bitrate = atol( optarg );
+                break;
+            case 'b':
+                param->i_bframe = atol( optarg );
+                break;
+            case 'i':
+                param->i_iframe = atol( optarg );
+                break;
+            case 'I':
+                param->i_idrframe = atol( optarg );
+                break;
+            case 'n':
+                param->b_deblocking_filter = 0;
+                break;
+            case 'f':
+            {
+                char *p = strchr( optarg, ':' );
+                if( p )
+                {
+                    param->i_deblocking_filter_alphac0 = atoi( optarg );
+                    param->i_deblocking_filter_beta = atoi( p + 1 );  /* value after the ':' */
+                }
+                break;
+            }
+            case 'q':
+                param->i_qp_constant = atoi( optarg );
+                break;
+            case 'r':
+                param->i_frame_reference = atoi( optarg );
+                break;
+            case 'c':
+                param->b_cabac = 1;
+                break;
+            case 'x':
+                *pb_decompress = 1;
+                break;
+            case 'C':
+                param->cpu = 0;
+                break;
+            case'o':
+                if( ( *p_fout = fopen( optarg, "wb" ) ) == NULL )
+                {
+                    fprintf( stderr, "cannot open output file `%s'\n", optarg );
+                    return -1;
+                }
+                break;
+            case 's':
+            {
+                char *p = strchr( optarg, ':' );
+                if( p )
+                {
+                    param->vui.i_sar_width = atoi( optarg );
+                    param->vui.i_sar_height = atoi( p + 1 );
+                }
+                break;
+            }
+            case 'A':
+                param->analyse.inter = 0;
+                if( strstr( optarg, "none" ) )  param->analyse.inter = 0x000000;
+                if( strstr( optarg, "all" ) )   param->analyse.inter = X264_ANALYSE_I4x4|X264_ANALYSE_PSUB16x16|X264_ANALYSE_PSUB8x8;
+                if( strstr( optarg, "i4x4" ) )      param->analyse.inter |= X264_ANALYSE_I4x4;
+                if( strstr( optarg, "psub16x16" ) ) param->analyse.inter |= X264_ANALYSE_PSUB16x16;
+                if( strstr( optarg, "psub8x8" ) )   param->analyse.inter |= X264_ANALYSE_PSUB8x8;
+                break;
+            default:
+                fprintf( stderr, "unknown option (%c)\n", optopt );
+                return -1;
+        }
+    }
+
+    /* Get the file name */
+    if( optind > argc - 1 )
+    {
+        Help();
+        return -1;
+    }
+    psz_filename = argv[optind++];
+
+    if( !(*pb_decompress) )
+    {
+        char *psz_size;
+        char *p;
+
+        if( optind > argc - 1 )
+        {
+            /* no size argument: look for <digits>x<digits> inside the file name */
+            psz_size = FindSizeInName( psz_filename );
+            if( psz_size == NULL )
+            {
+                Help();
+                return -1;
+            }
+            fprintf( stderr, "x264: file name gives %dx%d\n", atoi( psz_size ), atoi( strchr( psz_size, 'x' ) + 1 ) );
+        }
+        else
+            psz_size = argv[optind++];
+
+        /* <width><one separator character><height>, both strictly positive */
+        param->i_width  = strtol( psz_size, &p, 10 );
+        param->i_height = *p ? strtol( p + 1, &p, 10 ) : 0;
+        if( param->i_width <= 0 || param->i_height <= 0 )
+        {
+            fprintf( stderr, "invalid picture size `%s'\n", psz_size );
+            return -1;
+        }
+    }
+
+    /* open the input */
+    if( !strcmp( psz_filename, "-" ) )
+    {
+        *p_fin = stdin;
+        optind++;
+    }
+    else if( ( *p_fin = fopen( psz_filename, "rb" ) ) == NULL )
+    {
+        fprintf( stderr, "could not open input file '%s'\n", psz_filename );
+        return -1;
+    }
+
+    return 0;
+}
+
+/*****************************************************************************
+ * Decode: placeholder, prints a notice on stderr and returns -1 (arguments unused).
+ *****************************************************************************/
+static int  Decode( x264_param_t  *param, FILE *fh26l, FILE *fout )
+{
+    fprintf( stderr, "decompressor not working (help is welcome)\n" );
+    return -1;
+#if 0 /* disabled draft: uses h, pic, i_frame, i_start and i_end, none of which is declared here */
+    x264_nal_t nal;
+    int i_data;
+    int b_eof;
+
+    //param.cpu = 0;
+    if( ( h = x264_decoder_open( &param ) ) == NULL )
+    {
+        fprintf( stderr, "x264_decoder_open failed\n" );
+        return -1;
+    }
+
+    i_start = x264_mdate();
+    b_eof = 0;
+    i_frame = 0;
+    i_data  = 0;
+    nal.p_payload = malloc( DATA_MAX );
+
+    while( !i_ctrl_c )
+    {
+        uint8_t *p, *p_next, *end;
+        int i_size;
+        /* fill buffer */
+        if( i_data < DATA_MAX && !b_eof )
+        {
+            int i_read = fread( &data[i_data], 1, DATA_MAX - i_data, fh26l );
+            if( i_read <= 0 )
+            {
+                b_eof = 1;
+            }
+            else
+            {
+                i_data += i_read;
+            }
+        }
+
+        if( i_data < 3 )
+        {
+            break;
+        }
+
+        end = &data[i_data];
+
+        /* extract one nal: scan for the 00 00 01 start code */
+        p = &data[0];
+        while( p < end - 3 )
+        {
+            if( p[0] == 0x00 && p[1] == 0x00 && p[2] == 0x01 )
+            {
+                break;
+            }
+            p++;
+        }
+
+        if( p >= end - 3 )
+        {
+            fprintf( stderr, "garbage (i_data = %d)\n", i_data );
+            i_data = 0;
+            continue;
+        }
+
+        p_next = p + 3;     /* then scan for the start code of the following nal */
+        while( p_next < end - 3 )
+        {
+            if( p_next[0] == 0x00 && p_next[1] == 0x00 && p_next[2] == 0x01 )
+            {
+                break;
+            }
+            p_next++;
+        }
+
+        if( p_next == end - 3 && i_data < DATA_MAX )
+        {
+            p_next = end;
+        }
+
+        /* decode this nal */
+        i_size = p_next - p - 3;
+        if( i_size <= 0 )
+        {
+            if( b_eof )
+            {
+                break;
+            }
+            fprintf( stderr, "nal too large (FIXME) ?\n" );
+            i_data = 0;
+            continue;
+        }
+
+        x264_nal_decode( &nal, p +3, i_size );
+
+        /* decode the content of the nal */
+        x264_decoder_decode( h, &pic, &nal );
+
+        if( pic != NULL )
+        {
+            int i;
+
+            i_frame++;
+
+            for( i = 0; i < pic->i_plane;i++ )
+            {
+                int i_line;
+                int i_div;
+
+                i_div = i==0 ? 1 : 2;
+                for( i_line = 0; i_line < pic->i_height/i_div; i_line++ )
+                {
+                    fwrite( pic->plane[i]+i_line*pic->i_stride[i], 1, pic->i_width/i_div, fout );
+                }
+            }
+        }
+
+        memmove( &data[0], p_next, end - p_next );
+        i_data -= p_next - &data[0];
+    }
+
+    i_end = x264_mdate();
+    free( nal.p_payload );
+    fprintf( stderr, "\n" );
+
+    x264_decoder_close( h );
+
+    fclose( fh26l );
+    if( fout != stdout )
+    {
+        fclose( fout );
+    }
+    if( i_frame > 0 )
+    {
+        double fps = (double)i_frame * (double)1000000 /
+                     (double)( i_end - i_start );
+        fprintf( stderr, "decoded %d frames %ffps\n", i_frame, fps );
+    }
+#endif
+}
+
+/*****************************************************************************
+ * Encode: read I420 frames from fyuv, encode them and write the NALs to fout
+ * until end of input or Ctrl-C.  Closes fyuv, and fout unless it is stdout.
+ * Returns 0, or -1 if the size is invalid or the encoder cannot be opened.
+ *****************************************************************************/
+static int  Encode( x264_param_t  *param, FILE *fyuv, FILE *fout )
+{
+    x264_t *h;
+    x264_picture_t pic;
+    size_t  i_luma, i_chroma;
+    int     i_frame;
+    int64_t i_start, i_end;
+    int64_t i_file;
+
+    if( param->i_width <= 0 || param->i_height <= 0 )
+    {
+        fprintf( stderr, "invalid picture size %dx%d\n", param->i_width, param->i_height );
+        return -1;
+    }
+    i_luma   = (size_t)param->i_width * param->i_height;    /* bytes in plane 0 */
+    i_chroma = i_luma / 4;                                  /* bytes in planes 1 and 2 */
+
+    if( ( h = x264_encoder_open( param ) ) == NULL )
+    {
+        fprintf( stderr, "x264_encoder_open failed\n" );
+        return -1;
+    }
+
+    /* Create a new pic */
+    x264_picture_alloc( &pic, X264_CSP_I420, param->i_width, param->i_height );
+
+    i_start = x264_mdate();
+    for( i_frame = 0, i_file = 0; i_ctrl_c == 0 ; i_frame++ )
+    {
+        int         i_nal;
+        x264_nal_t  *nal;
+        int         i;
+
+        /* read a frame; a short read ends the input (a partial frame is dropped) */
+        if( fread( pic.img.plane[0], 1, i_luma,   fyuv ) != i_luma   ||
+            fread( pic.img.plane[1], 1, i_chroma, fyuv ) != i_chroma ||
+            fread( pic.img.plane[2], 1, i_chroma, fyuv ) != i_chroma )
+        {
+            break;
+        }
+
+        /* Do not force any parameters */
+        pic.i_type = X264_TYPE_AUTO;
+        if( x264_encoder_encode( h, &nal, &i_nal, &pic ) < 0 )
+        {
+            /* nal and i_nal may not have been set: write nothing for this frame */
+            fprintf( stderr, "x264_encoder_encode failed\n" );
+            continue;
+        }
+
+        for( i = 0; i < i_nal; i++ )
+        {
+            int i_size;
+            int i_data = DATA_MAX;
+            if( ( i_size = x264_nal_encode( data, &i_data, 1, &nal[i] ) ) > 0 )
+            {
+                i_file += fwrite( data, 1, i_size, fout );
+            }
+            else if( i_size < 0 )
+            {
+                fprintf( stderr,
+                         "need to increase buffer size (size=%d)\n", -i_size );
+            }
+        }
+    }
+    i_end = x264_mdate();
+    x264_picture_clean( &pic );
+    x264_encoder_close( h );
+    fprintf( stderr, "\n" );
+
+    fclose( fyuv );
+    if( fout != stdout )
+        fclose( fout );
+
+    if( i_frame > 0 )
+    {
+        double fps = (double)i_frame * (double)1000000 /
+                     (double)( i_end - i_start );
+        /* 25 matches the f_fps set in main(); cast since int64_t need not be long long */
+        fprintf( stderr, "encoded %d frames %ffps %lld kb/s\n", i_frame, fps,
+                 (long long)( i_file * 8 * 25 / i_frame / 1000 ) );
+    }
+
+    return 0;
+}
+
+
diff --git a/x264.h b/x264.h
new file mode 100644
index 00000000..772c6c7f
--- /dev/null
+++ b/x264.h
@@ -0,0 +1,249 @@
+/*****************************************************************************
+ * x264.h: h264 encoder library
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: x264.h,v 1.1 2004/06/03 19:24:12 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#ifndef _X264_H
+#define _X264_H 1
+
+#define X264_BUILD 0x0008
+
+/* x264_t:
+ *      opaque handle for decoder and encoder */
+typedef struct x264_t x264_t;
+
+/****************************************************************************
+ * Initialisation structure and function.
+ ****************************************************************************/
+/* CPU flags
+ */
+#define X264_CPU_MMX        0x000001    /* mmx */
+#define X264_CPU_MMXEXT     0x000002    /* mmx-ext*/
+#define X264_CPU_SSE        0x000004    /* sse */
+#define X264_CPU_SSE2       0x000008    /* sse 2 */
+#define X264_CPU_3DNOW      0x000010    /* 3dnow! */
+#define X264_CPU_3DNOWEXT   0x000020    /* 3dnow! ext */
+#define X264_CPU_ALTIVEC    0x000040    /* altivec */
+
+/* Analyse flags
+ */
+#define X264_ANALYSE_I4x4       0x0001  /* Analyse i4x4 */
+#define X264_ANALYSE_PSUB16x16  0x0010  /* Analyse p16x8, p8x16 and p8x8 */
+#define X264_ANALYSE_PSUB8x8    0x0020  /* Analyse p8x4, p4x8, p4x4 */
+
+/* Colorspace type
+ */
+#define X264_CSP_MASK           0x00ff  /* */
+#define X264_CSP_NONE           0x0000  /* Invalid mode     */
+#define X264_CSP_I420           0x0001  /* yuv 4:2:0 planar */
+#define X264_CSP_I422           0x0002  /* yuv 4:2:2 planar */
+#define X264_CSP_I444           0x0003  /* yuv 4:4:4 planar */
+#define X264_CSP_YV12           0x0004  /* yuv 4:2:0 planar */
+#define X264_CSP_YUYV           0x0005  /* yuv 4:2:2 packed */
+#define X264_CSP_RGB            0x0006  /* rgb 24bits       */
+#define X264_CSP_BGR            0x0007  /* bgr 24bits       */
+#define X264_CSP_BGRA           0x0008  /* bgr 32bits       */
+#define X264_CSP_VFLIP          0x1000  /* */
+
+/* Slice type
+ */
+#define X264_TYPE_AUTO          0x0000  /* Let x264 choose the right type */
+#define X264_TYPE_IDR           0x0001
+#define X264_TYPE_I             0x0002
+#define X264_TYPE_P             0x0003
+#define X264_TYPE_B             0x0004
+
+typedef struct
+{
+    /* CPU flags */
+    unsigned int cpu;
+
+    /* Video Properties */
+    int         i_width;
+    int         i_height;
+    int         i_csp;  /* CSP of encoded bitstream, only i420 supported */
+
+    struct
+    {
+        /* they will be reduced so that 0 < x <= 65535 and the pair is coprime */
+        int         i_sar_height;
+        int         i_sar_width;
+    } vui;
+
+    float       f_fps;  /* Used for rate control only */
+
+    /* Bitstream parameters */
+    int         i_frame_reference;  /* Maximum number of reference frames */
+    int         i_idrframe; /* every i_idrframe-th I frame is marked as IDR */
+    int         i_iframe;   /* every i_iframe-th frame is intra */
+    int         i_bframe;   /* number of B-frames between two reference pictures */
+
+    int         b_deblocking_filter;
+    int         i_deblocking_filter_alphac0;    /* [-6, 6] -6 light filter, 6 strong */
+    int         i_deblocking_filter_beta;       /* [-6, 6]  idem */
+
+    int         b_cabac;
+    int         i_cabac_init_idc;
+
+    int         i_qp_constant;  /* 1-51 */
+    int         i_bitrate;      /* not working yet */
+
+    /* Encoder analyser parameters */
+    struct
+    {
+        unsigned int intra;     /* intra flags */
+        unsigned int inter;     /* inter flags */
+    } analyse;
+
+} x264_param_t;
+
+/* x264_param_default:
+ *      fill x264_param_t with default values and do CPU detection */
+void    x264_param_default( x264_param_t * );
+
+/****************************************************************************
+ * Picture structures and functions.
+ ****************************************************************************/
+typedef struct
+{
+    int     i_csp;
+
+    int     i_plane;
+    int     i_stride[4];
+    uint8_t *plane[4];
+} x264_image_t;
+
+typedef struct
+{
+    /* In: force picture type (if not auto) XXX: ignored for now
+     * Out: type of the picture encoded */
+    int     i_type;
+    /* In: force quantizer if > 0 */
+    int     i_qpplus1;
+    /* In: user pts, Out: pts of encoded picture (user)*/
+    int64_t i_pts;
+
+    /* In: raw data */
+    x264_image_t img;
+} x264_picture_t;
+
+/* x264_picture_alloc:
+ *  alloc data for a picture. You must call x264_picture_clean on it. */
+void x264_picture_alloc( x264_picture_t *pic, int i_csp, int i_width, int i_height );
+
+/* x264_picture_clean:
+ *  free associated resource for a x264_picture_t allocated with
+ *  x264_picture_alloc ONLY */
+void x264_picture_clean( x264_picture_t *pic );
+
+/****************************************************************************
+ * NAL structure and functions:
+ ****************************************************************************/
+/* nal */
+enum nal_unit_type_e
+{
+    NAL_UNKNOWN = 0,
+    NAL_SLICE   = 1,
+    NAL_SLICE_DPA   = 2,
+    NAL_SLICE_DPB   = 3,
+    NAL_SLICE_DPC   = 4,
+    NAL_SLICE_IDR   = 5,    /* ref_idc != 0 */
+    NAL_SEI         = 6,    /* ref_idc == 0 */
+    NAL_SPS         = 7,
+    NAL_PPS         = 8
+    /* ref_idc == 0 for 6,9,10,11,12 */
+};
+enum nal_priority_e
+{
+    NAL_PRIORITY_DISPOSABLE = 0,
+    NAL_PRIORITY_LOW        = 1,
+    NAL_PRIORITY_HIGH       = 2,
+    NAL_PRIORITY_HIGHEST    = 3,
+};
+
+typedef struct
+{
+    int i_ref_idc;  /* nal_priority_e */
+    int i_type;     /* nal_unit_type_e */
+
+    /* Raw payload data */
+    int     i_payload;
+    uint8_t *p_payload;
+} x264_nal_t;
+
+/* x264_nal_encode:
+ *      encode a nal into a buffer, setting the size.
+ *      if b_annexeb then a long sync word is added
+ *      XXX: it currently doesn't check for overflow */
+int x264_nal_encode( void *, int *, int b_annexeb, x264_nal_t *nal );
+
+/* x264_nal_decode:
+ *      decode a buffer nal into a x264_nal_t */
+int x264_nal_decode( x264_nal_t *nal, void *, int );
+
+/****************************************************************************
+ * Encoder functions:
+ ****************************************************************************/
+
+/* x264_encoder_open:
+ *      create a new encoder handle, all parameters from x264_param_t are copied */
+x264_t *x264_encoder_open   ( x264_param_t * );
+/* x264_encoder_headers:
+ *      return the SPS and PPS that will be used for the whole stream */
+int     x264_encoder_headers( x264_t *, x264_nal_t **, int * );
+/* x264_encoder_encode:
+ *      encode one picture */
+int     x264_encoder_encode ( x264_t *, x264_nal_t **, int *, x264_picture_t * );
+/* x264_encoder_close:
+ *      close an encoder handle */
+void    x264_encoder_close  ( x264_t * );
+
+/* XXX: decoder isn't working so no need to export it */
+#if 0
+/****************************************************************************
+ * Decoder functions:
+ ****************************************************************************
+ * XXX: Not yet working so do not try ...
+ ****************************************************************************/
+/* x264_decoder_open:
+ */
+x264_t *x264_decoder_open   ( x264_param_t * );
+/* x264_decoder_decode:
+ */
+int     x264_decoder_decode ( x264_t *, x264_picture_t **, x264_nal_t * );
+/* x264_decoder_close:
+ */
+void    x264_decoder_close  ( x264_t * );
+#endif
+
+/****************************************************************************
+ * Private stuff for internal usage:
+ ****************************************************************************/
+#ifdef __X264__
+#   ifdef _MSC_VER
+#       define inline __inline
+#       define DECLARE_ALIGNED( type, var, n ) __declspec(align(n)) type var
+#   else
+#       define DECLARE_ALIGNED( type, var, n ) type var __attribute__((aligned(n)))
+#   endif
+#endif
+
+#endif