[disable HarfBuzz support @<:@default=check@:>@]))
AC_ARG_ENABLE([asm], AS_HELP_STRING([--disable-asm],
[disable compiling with ASM @<:@default=check@:>@]))
+AC_ARG_ENABLE([rasterizer], AS_HELP_STRING([--disable-rasterizer],
+ [disable internal rasterizer @<:@default=enabled@:>@]))
+AC_ARG_ENABLE([large-tiles], AS_HELP_STRING([--enable-large-tiles],
+ [use larger tiles in the rasterizer (better performance, slightly worse quality) @<:@default=disabled@:>@]))
AS_IF([test x$enable_asm != xno], [
AS_CASE([$host],
[AC_DEFINE(CONFIG_ASM, 0, [ASM enabled])]
)
+AM_CONDITIONAL([RASTERIZER], [test x$enable_rasterizer != xno])
+
+AM_COND_IF([RASTERIZER],
+ [AC_DEFINE(CONFIG_RASTERIZER, 1, [rasterizer enabled])],
+ [AC_DEFINE(CONFIG_RASTERIZER, 0, [rasterizer enabled])]
+ )
+
+AM_CONDITIONAL([ENABLE_LARGE_TILES], [test x$enable_large_tiles = xyes])
+
dnl AM_COND_IF takes [if-true] and [if-false] as separate, comma-delimited
dnl arguments; the branches were previously concatenated into one.
AM_COND_IF([ENABLE_LARGE_TILES],
    [AC_DEFINE(CONFIG_LARGE_TILES, 1, [use large tiles])],
    [AC_DEFINE(CONFIG_LARGE_TILES, 0, [use small tiles])]
    )
+
PKG_CHECK_MODULES([FREETYPE], freetype2 >= 9.10.3, [
CFLAGS="$CFLAGS $FREETYPE_CFLAGS"
LIBS="$LIBS $FREETYPE_LIBS"
SRC_INTEL = x86/blend_bitmaps.asm x86/cpuid.asm x86/blend_bitmaps.h x86/cpuid.h
SRC_INTEL64 = x86/be_blur.asm x86/be_blur.h
+SRC_INTEL_RASTERIZER = x86/rasterizer.asm x86/rasterizer.h
+
+SRC_RASTERIZER = ass_rasterizer.h ass_rasterizer.c ass_rasterizer_c.c
lib_LTLIBRARIES = libass.la
libass_la_SOURCES = ass.c ass_cache.c ass_font.c ass_fontconfig.c ass_render.c \
libass_la_LDFLAGS = -no-undefined -version-info $(LIBASS_LT_CURRENT):$(LIBASS_LT_REVISION):$(LIBASS_LT_AGE)
libass_la_LDFLAGS += -export-symbols $(srcdir)/libass.sym
+if RASTERIZER
+libass_la_SOURCES += $(SRC_RASTERIZER)
+endif
+
if ASM
if INTEL
libass_la_SOURCES += $(SRC_INTEL)
+if RASTERIZER
+libass_la_SOURCES += $(SRC_INTEL_RASTERIZER)
+endif
if X64
libass_la_SOURCES += $(SRC_INTEL64)
endif
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/
+#include "config.h"
+
#include <stdlib.h>
#include <string.h>
#include <math.h>
return dst;
}
-Bitmap *outline_to_bitmap(ASS_Library *library, FT_Library ftlib,
+#if CONFIG_RASTERIZER
+
/**
 * \brief Rasterize a FreeType outline into a grayscale bitmap using the
 * internal polygon rasterizer.
 * \param render_priv renderer state (provides the rasterizer and the
 *                    library handle used for logging)
 * \param outline source outline in 26.6 fixed-point coordinates
 * \param bord border padding, in pixels, added on every side
 * \return newly allocated bitmap, or NULL on failure
 */
Bitmap *outline_to_bitmap(ASS_Renderer *render_priv,
                          FT_Outline *outline, int bord)
{
    ASS_Rasterizer *rst = &render_priv->rasterizer;
    if (!rasterizer_set_outline(rst, outline)) {
        ass_msg(render_priv->library, MSGL_WARN, "Failed to process glyph outline!\n");
        return NULL;
    }

    // Empty bounding box: return a blank bitmap consisting only of border.
    if (rst->x_min >= rst->x_max || rst->y_min >= rst->y_max) {
        Bitmap *bm = alloc_bitmap(2 * bord, 2 * bord);
        bm->left = bm->top = -bord;
        return bm;
    }

    // Convert 26.6 bounds to whole pixels: min rounded down, max rounded up.
    int x_min = rst->x_min >> 6;
    int y_min = rst->y_min >> 6;
    int x_max = (rst->x_max + 63) >> 6;
    int y_max = (rst->y_max + 63) >> 6;
    int w = x_max - x_min;
    int h = y_max - y_min;

    // Sanity limit against pathological outlines.
    if (w * h > 8000000) {
        ass_msg(render_priv->library, MSGL_WARN, "Glyph bounding box too large: %dx%dpx",
                w, h);
        return NULL;
    }

    // Rasterizer works in whole tiles; round dimensions (incl. border)
    // up to a multiple of the tile size.
    int mask = (1 << rst->tile_order) - 1;
    int tile_w = (w + 2 * bord + mask) & ~mask;
    int tile_h = (h + 2 * bord + mask) & ~mask;
    Bitmap *bm = alloc_bitmap(tile_w, tile_h);
    bm->left = x_min - bord;
    bm->top = -y_max - bord;  // y axis is flipped in the output bitmap

    // offs: largest tile-aligned part of the border that can be skipped.
    // NOTE(review): the fill height below subtracts offs (derived from the
    // left/top border) while the vertical start uses bord_h — assumes the
    // tile-alignment arithmetic matches on both axes; verify.
    int offs = bord & ~mask;
    int bord_h = tile_h - h - bord;
    if (!rasterizer_fill(rst,
                         bm->buffer + offs * (bm->stride + 1),
                         x_min - bord + offs,
                         y_min - bord_h + (bord_h & ~mask),
                         ((w + bord + mask) & ~mask) - offs,
                         ((h + bord + mask) & ~mask) - offs,
                         bm->stride, 1)) {
        ass_msg(render_priv->library, MSGL_WARN, "Failed to rasterize glyph!\n");
        ass_free_bitmap(bm);
        return NULL;
    }

    return bm;
}
+
+#else
+
+Bitmap *outline_to_bitmap(ASS_Renderer *render_priv,
FT_Outline *outline, int bord)
{
Bitmap *bm;
bbox.yMax >>= 6;
if (w * h > 8000000) {
- ass_msg(library, MSGL_WARN, "Glyph bounding box too large: %dx%dpx",
+ ass_msg(render_priv->library, MSGL_WARN, "Glyph bounding box too large: %dx%dpx",
w, h);
return NULL;
}
bitmap.pixel_mode = FT_PIXEL_MODE_GRAY;
// render into target bitmap
- if ((error = FT_Outline_Get_Bitmap(ftlib, outline, &bitmap))) {
- ass_msg(library, MSGL_WARN, "Failed to rasterize glyph: %d\n", error);
+ if ((error = FT_Outline_Get_Bitmap(render_priv->ftlibrary, outline, &bitmap))) {
+ ass_msg(render_priv->library, MSGL_WARN, "Failed to rasterize glyph: %d\n", error);
ass_free_bitmap(bm);
return NULL;
}
return bm;
}
+#endif
+
/**
* \brief fix outline bitmap
*
}
}
-int outline_to_bitmap3(ASS_Library *library, ASS_SynthPriv *priv_blur,
- FT_Library ftlib, FT_Outline *outline, FT_Outline *border,
+int outline_to_bitmap3(ASS_Renderer *render_priv, FT_Outline *outline, FT_Outline *border,
Bitmap **bm_g, Bitmap **bm_o, Bitmap **bm_s,
int be, double blur_radius, FT_Vector shadow_offset,
int border_style, int border_visible)
*bm_g = *bm_o = *bm_s = 0;
if (outline)
- *bm_g = outline_to_bitmap(library, ftlib, outline, bord);
+ *bm_g = outline_to_bitmap(render_priv, outline, bord);
if (!*bm_g)
return 1;
if (border) {
- *bm_o = outline_to_bitmap(library, ftlib, border, bord);
+ *bm_o = outline_to_bitmap(render_priv, border, bord);
if (!*bm_o) {
return 1;
}
unsigned char *buffer; // h * stride buffer
} Bitmap;
-Bitmap *outline_to_bitmap(ASS_Library *library, FT_Library ftlib,
+Bitmap *outline_to_bitmap(ASS_Renderer *render_priv,
FT_Outline *outline, int bord);
Bitmap *alloc_bitmap(int w, int h);
* \param be 1 = produces blurred bitmaps, 0 = normal bitmaps
* \param border_visible whether border is visible if border_style is 3
*/
-int outline_to_bitmap3(ASS_Library *library, ASS_SynthPriv *priv_blur,
- FT_Library ftlib, FT_Outline *outline, FT_Outline *border,
+int outline_to_bitmap3(ASS_Renderer *render_priv, FT_Outline *outline, FT_Outline *border,
Bitmap **bm_g, Bitmap **bm_o, Bitmap **bm_s,
int be, double blur_radius, FT_Vector shadow_offset,
int border_style, int border_visible);
--- /dev/null
+/*
+ * Copyright (C) 2014 Vabishchevich Nikolay <vabnick@gmail.com>
+ *
+ * This file is part of libass.
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include "ass_utils.h"
+#include "ass_rasterizer.h"
+#include <assert.h>
+
+#ifdef _MSC_VER
+#include <intrin.h>
+#pragma intrinsic(_BitScanReverse)
+#endif
+
+
+
/**
 * \brief Integer binary logarithm: index of the highest set bit.
 * \param n must be nonzero (all three branches are undefined for n == 0)
 * \return floor(log2(n))
 */
static inline int ilog2(uint32_t n) // XXX: different compilers
{
#ifdef __GNUC__
    return __builtin_clz(n) ^ 31;
#elif defined(_MSC_VER)
    // _BitScanReverse takes an unsigned long index pointer; the previous
    // 'int res' was a type mismatch that MSVC rejects.
    unsigned long res;
    _BitScanReverse(&res, n);
    return (int)res;
#else
    // Portable fallback: binary search over halves of the bit range.
    int res = 0;
    for (int ord = 16; ord; ord /= 2)
        if (n >= ((uint32_t)1 << ord)) {
            res += ord;
            n >>= ord;
        }
    return res;
#endif
}
+
+
+void rasterizer_init(ASS_Rasterizer *rst)
+{
+ rst->linebuf[0] = rst->linebuf[1] = NULL;
+ rst->size[0] = rst->capacity[0] = 0;
+ rst->size[1] = rst->capacity[1] = 0;
+}
+
+/**
+ * \brief Ensure sufficient buffer size (allocate if necessary)
+ * \param index index (0 or 1) of the input segment buffer (rst->linebuf)
+ * \param delta requested size increase
+ * \return zero on error
+ */
+static inline int check_capacity(ASS_Rasterizer *rst, int index, size_t delta)
+{
+ delta += rst->size[index];
+ if (rst->capacity[index] >= delta)
+ return 1;
+
+ size_t capacity = FFMAX(2 * rst->capacity[index], 64);
+ while (capacity < delta)
+ capacity *= 2;
+ void *ptr = realloc(rst->linebuf[index], sizeof(struct segment) * capacity);
+ if (!ptr)
+ return 0;
+
+ rst->linebuf[index] = (struct segment *)ptr;
+ rst->capacity[index] = capacity;
+ return 1;
+}
+
+void rasterizer_done(ASS_Rasterizer *rst)
+{
+ free(rst->linebuf[0]);
+ free(rst->linebuf[1]);
+}
+
+
// A point of an outline, in 26.6 fixed-point coordinates.
typedef struct {
    int32_t x, y;
} OutlinePoint;

// Helper struct for spline split decision
typedef struct {
    OutlinePoint r;     // chord vector from segment start to end
    int64_t r2, er;     // squared chord length; error threshold scaled by chord size
} OutlineSegment;

/**
 * \brief Prepare a subdivision test for the chord beg->end.
 * \param outline_error acceptable deviation from the true curve
 */
static inline void segment_init(OutlineSegment *seg,
                                OutlinePoint beg, OutlinePoint end,
                                int32_t outline_error)
{
    int32_t x = end.x - beg.x;
    int32_t y = end.y - beg.y;
    int32_t abs_x = x < 0 ? -x : x;
    int32_t abs_y = y < 0 ? -y : y;

    seg->r.x = x;
    seg->r.y = y;
    seg->r2 = x * (int64_t)x + y * (int64_t)y;
    seg->er = outline_error * (int64_t)FFMAX(abs_x, abs_y);
}

/**
 * \brief Decide whether a control point lies too far from the chord.
 * \return nonzero if the spline must be subdivided further
 */
static inline int segment_subdivide(const OutlineSegment *seg,
                                    OutlinePoint beg, OutlinePoint pt)
{
    int32_t x = pt.x - beg.x;
    int32_t y = pt.y - beg.y;
    // Dot product (projection onto chord) and cross product (distance from it).
    int64_t pdr = seg->r.x * (int64_t)x + seg->r.y * (int64_t)y;
    int64_t pcr = seg->r.x * (int64_t)y - seg->r.y * (int64_t)x;
    // Split if the projection falls outside the chord (with tolerance)
    // or the perpendicular deviation exceeds the error threshold.
    return pdr < -seg->er || pdr > seg->r2 + seg->er ||
        (pcr < 0 ? -pcr : pcr) > seg->er;
}
+
/**
 * \brief Add new segment to polyline
 * \return zero on allocation failure
 */
static inline int add_line(ASS_Rasterizer *rst, OutlinePoint pt0, OutlinePoint pt1)
{
    int32_t x = pt1.x - pt0.x;
    int32_t y = pt1.y - pt0.y;
    if (!x && !y)   // degenerate segment, contributes nothing
        return 1;

    if (!check_capacity(rst, 0, 1))
        return 0;
    struct segment *line = rst->linebuf[0] + rst->size[0];
    ++rst->size[0];

    // All four bounding-box sides start as exact segment endpoints;
    // then direction flags are derived: UP is set when y >= 0, and
    // UR_DL when exactly one of (x < 0), (y >= 0) holds, i.e. the
    // segment points up-right or down-left.
    line->flags = SEGFLAG_EXACT_LEFT | SEGFLAG_EXACT_RIGHT |
                  SEGFLAG_EXACT_BOTTOM | SEGFLAG_EXACT_TOP;
    if (x < 0)
        line->flags ^= SEGFLAG_UR_DL;
    if (y >= 0)
        line->flags ^= SEGFLAG_UP | SEGFLAG_UR_DL;

    line->x_min = FFMIN(pt0.x, pt1.x);
    line->x_max = FFMAX(pt0.x, pt1.x);
    line->y_min = FFMIN(pt0.y, pt1.y);
    line->y_max = FFMAX(pt0.y, pt1.y);

    // Line equation a * x + b * y = c passing through both endpoints.
    line->a = y;
    line->b = -x;
    line->c = y * (int64_t)pt0.x - x * (int64_t)pt0.y;

    // halfplane normalization: shift a, b, c so that max(|a|, |b|) is
    // close to 1 << 30, then compute scale ~= (1 << 61) / max_ab via a
    // polynomial approximation (exact division shown commented out).
    int32_t abs_x = x < 0 ? -x : x;
    int32_t abs_y = y < 0 ? -y : y;
    uint32_t max_ab = (abs_x > abs_y ? abs_x : abs_y);
    int shift = 30 - ilog2(max_ab);
    max_ab <<= shift + 1;
    line->a <<= shift;
    line->b <<= shift;
    line->c <<= shift;
    line->scale = (uint64_t)0x53333333 * (uint32_t)(max_ab * (uint64_t)max_ab >> 32) >> 32;
    line->scale += 0x8810624D - (0xBBC6A7EF * (uint64_t)max_ab >> 32);
    //line->scale = ((uint64_t)1 << 61) / max_ab;
    return 1;
}
+
/**
 * \brief Add quadratic spline to polyline
 * Performs recursive subdivision if necessary.
 * \return zero on allocation failure
 */
static int add_quadratic(ASS_Rasterizer *rst,
                         OutlinePoint pt0, OutlinePoint pt1, OutlinePoint pt2)
{
    OutlineSegment seg;
    segment_init(&seg, pt0, pt2, rst->outline_error);
    // Flat enough: approximate the spline with its chord.
    if (!segment_subdivide(&seg, pt0, pt1))
        return add_line(rst, pt0, pt2);

    // De Casteljau split at t = 1/2: edge midpoints p01, p12 and the
    // curve point c, with rounding applied before the halving shifts.
    OutlinePoint p01, p12, c; // XXX: overflow?
    p01.x = pt0.x + pt1.x;
    p01.y = pt0.y + pt1.y;
    p12.x = pt1.x + pt2.x;
    p12.y = pt1.y + pt2.y;
    c.x = (p01.x + p12.x + 2) >> 2;
    c.y = (p01.y + p12.y + 2) >> 2;
    p01.x >>= 1;
    p01.y >>= 1;
    p12.x >>= 1;
    p12.y >>= 1;
    return add_quadratic(rst, pt0, p01, c) && add_quadratic(rst, c, p12, pt2);
}
+
/**
 * \brief Add cubic spline to polyline
 * Performs recursive subdivision if necessary.
 * \return zero on allocation failure
 */
static int add_cubic(ASS_Rasterizer *rst,
                     OutlinePoint pt0, OutlinePoint pt1, OutlinePoint pt2, OutlinePoint pt3)
{
    OutlineSegment seg;
    segment_init(&seg, pt0, pt3, rst->outline_error);
    // Flat enough (both control points near the chord): use a line.
    if (!segment_subdivide(&seg, pt0, pt1) && !segment_subdivide(&seg, pt0, pt2))
        return add_line(rst, pt0, pt3);

    // De Casteljau split at t = 1/2; the +2 / -1 terms implement
    // rounding for the subsequent halving shifts.
    OutlinePoint p01, p12, p23, p012, p123, c; // XXX: overflow?
    p01.x = pt0.x + pt1.x;
    p01.y = pt0.y + pt1.y;
    p12.x = pt1.x + pt2.x + 2;
    p12.y = pt1.y + pt2.y + 2;
    p23.x = pt2.x + pt3.x;
    p23.y = pt2.y + pt3.y;
    p012.x = p01.x + p12.x;
    p012.y = p01.y + p12.y;
    p123.x = p12.x + p23.x;
    p123.y = p12.y + p23.y;
    c.x = (p012.x + p123.x - 1) >> 3;
    c.y = (p012.y + p123.y - 1) >> 3;
    p01.x >>= 1;
    p01.y >>= 1;
    p012.x >>= 2;
    p012.y >>= 2;
    p123.x >>= 2;
    p123.y >>= 2;
    p23.x >>= 1;
    p23.y >>= 1;
    return add_cubic(rst, pt0, p01, p012, c) && add_cubic(rst, c, p123, p23, pt3);
}
+
+
/**
 * \brief Convert a FreeType outline into the internal polyline and
 * compute its exact bounding box (rst->x_min/x_max/y_min/y_max).
 * \return zero on error (malformed tag sequence or allocation failure)
 *
 * Walks each contour with a small state machine over FreeType point tags:
 *   S_ON  - previous point was on-curve,
 *   S_Q   - one conic (quadratic) control point pending,
 *   S_C1  - first cubic control point seen,
 *   S_C2  - both cubic control points seen.
 */
int rasterizer_set_outline(ASS_Rasterizer *rst, const FT_Outline *path)
{
    enum Status {
        S_ON, S_Q, S_C1, S_C2
    };

    int i, j = 0;
    rst->size[0] = 0;
    for (i = 0; i < path->n_contours; ++i) {
        OutlinePoint start, p[4];
        int process_end = 1;
        enum Status st;

        // Determine the contour's starting point. A contour may begin
        // on a conic control point, in which case the real start is the
        // preceding on-curve point or an implied midpoint (per the
        // FreeType outline conventions).
        int last = path->contours[i];
        switch (FT_CURVE_TAG(path->tags[j])) {
        case FT_CURVE_TAG_ON:
            p[0].x = path->points[j].x;
            p[0].y = path->points[j].y;
            start = p[0];
            st = S_ON;
            break;

        case FT_CURVE_TAG_CONIC:
            switch (FT_CURVE_TAG(path->tags[last])) {
            case FT_CURVE_TAG_ON:
                // Contour effectively starts at its last point.
                p[0].x = path->points[last].x;
                p[0].y = path->points[last].y;
                p[1].x = path->points[j].x;
                p[1].y = path->points[j].y;
                process_end = 0;   // closing segment handled implicitly
                st = S_Q;
                break;

            case FT_CURVE_TAG_CONIC:
                // Two consecutive conic controls: start at their midpoint.
                p[1].x = path->points[j].x;
                p[1].y = path->points[j].y;
                p[0].x = (p[1].x + path->points[last].x) >> 1;
                p[0].y = (p[1].y + path->points[last].y) >> 1;
                start = p[0];
                st = S_Q;
                break;

            default:
                return 0;   // cubic control cannot wrap a contour start
            }
            break;

        default:
            return 0;   // contour cannot start on a cubic control point
        }

        // Main walk over the remaining points of the contour.
        for (j++; j <= last; ++j)
            switch (FT_CURVE_TAG(path->tags[j])) {
            case FT_CURVE_TAG_ON:
                switch (st) {
                case S_ON:
                    p[1].x = path->points[j].x;
                    p[1].y = path->points[j].y;
                    if (!add_line(rst, p[0], p[1]))
                        return 0;
                    p[0] = p[1];
                    break;

                case S_Q:
                    // On-curve point closes a quadratic spline.
                    p[2].x = path->points[j].x;
                    p[2].y = path->points[j].y;
                    if (!add_quadratic(rst, p[0], p[1], p[2]))
                        return 0;
                    p[0] = p[2];
                    st = S_ON;
                    break;

                case S_C2:
                    // On-curve point closes a cubic spline.
                    p[3].x = path->points[j].x;
                    p[3].y = path->points[j].y;
                    if (!add_cubic(rst, p[0], p[1], p[2], p[3]))
                        return 0;
                    p[0] = p[3];
                    st = S_ON;
                    break;

                default:
                    return 0;   // on-curve after a single cubic control
                }
                break;

            case FT_CURVE_TAG_CONIC:
                switch (st) {
                case S_ON:
                    p[1].x = path->points[j].x;
                    p[1].y = path->points[j].y;
                    st = S_Q;
                    break;

                case S_Q:
                    // Two conic controls in a row: implied on-curve
                    // point at their midpoint splits the spline.
                    p[3].x = path->points[j].x;
                    p[3].y = path->points[j].y;
                    p[2].x = (p[1].x + p[3].x) >> 1;
                    p[2].y = (p[1].y + p[3].y) >> 1;
                    if (!add_quadratic(rst, p[0], p[1], p[2]))
                        return 0;
                    p[0] = p[2];
                    p[1] = p[3];
                    break;

                default:
                    return 0;   // conic control inside a cubic sequence
                }
                break;

            case FT_CURVE_TAG_CUBIC:
                switch (st) {
                case S_ON:
                    p[1].x = path->points[j].x;
                    p[1].y = path->points[j].y;
                    st = S_C1;
                    break;

                case S_C1:
                    p[2].x = path->points[j].x;
                    p[2].y = path->points[j].y;
                    st = S_C2;
                    break;

                default:
                    return 0;   // cubic controls must come in pairs
                }
                break;

            default:
                return 0;   // unknown tag
            }

        // Close the contour back to its starting point.
        if (process_end)
            switch (st) {
            case S_ON:
                if (!add_line(rst, p[0], start))
                    return 0;
                break;

            case S_Q:
                if (!add_quadratic(rst, p[0], p[1], start))
                    return 0;
                break;

            case S_C2:
                if (!add_cubic(rst, p[0], p[1], p[2], start))
                    return 0;
                break;

            default:
                return 0;   // contour ends mid-cubic
            }
    }

    // Exact bounds of the generated polyline.
    size_t k;
    rst->x_min = rst->y_min = 0x7FFFFFFF;
    rst->x_max = rst->y_max = 0x80000000;
    for (k = 0; k < rst->size[0]; ++k) {
        rst->x_min = FFMIN(rst->x_min, rst->linebuf[0][k].x_min);
        rst->x_max = FFMAX(rst->x_max, rst->linebuf[0][k].x_max);
        rst->y_min = FFMIN(rst->y_min, rst->linebuf[0][k].y_min);
        rst->y_max = FFMAX(rst->y_max, rst->linebuf[0][k].y_max);
    }
    return 1;
}
+
+
/**
 * \brief Translate a segment left by x and clamp its box to x_min >= 0.
 * Clears EXACT_BOTTOM when the clamped left edge cuts off the point
 * that made the bottom bound exact (UR_DL segments reach their minimum
 * y at minimum x).
 */
static void segment_move_x(struct segment *line, int32_t x)
{
    line->x_min -= x;
    line->x_max -= x;
    line->x_min = FFMAX(line->x_min, 0);
    line->c -= line->a * (int64_t)x;

    static const int test = SEGFLAG_EXACT_LEFT | SEGFLAG_UR_DL;
    if (!line->x_min && (line->flags & test) == test)
        line->flags &= ~SEGFLAG_EXACT_BOTTOM;
}
+
/**
 * \brief Translate a segment down by y and clamp its box to y_min >= 0.
 * Mirror of segment_move_x: clears EXACT_LEFT when clamping the bottom
 * edge cuts off the point that made the left bound exact.
 */
static void segment_move_y(struct segment *line, int32_t y)
{
    line->y_min -= y;
    line->y_max -= y;
    line->y_min = FFMAX(line->y_min, 0);
    line->c -= line->b * (int64_t)y;

    static const int test = SEGFLAG_EXACT_BOTTOM | SEGFLAG_UR_DL;
    if (!line->y_min && (line->flags & test) == test)
        line->flags &= ~SEGFLAG_EXACT_LEFT;
}
+
/**
 * \brief Split a segment at vertical line x into a left part (*line)
 * and a right part (*next, shifted to a zero-based x origin).
 * The exactness flags are redistributed: the cut introduces an exact
 * right edge on the left part and an exact left edge on the right part,
 * while the bottom/top exactness moves to whichever half keeps the
 * corresponding endpoint (swapped for UR_DL-direction segments).
 */
static void segment_split_horz(struct segment *line, struct segment *next, int32_t x)
{
    assert(x > line->x_min && x < line->x_max);

    *next = *line;
    next->c -= line->a * (int64_t)x;
    next->x_min = 0;
    next->x_max -= x;
    line->x_max = x;

    line->flags &= ~SEGFLAG_EXACT_BOTTOM;
    next->flags &= ~SEGFLAG_EXACT_TOP;
    if (line->flags & SEGFLAG_UR_DL) {
        int32_t tmp = line->flags;
        line->flags = next->flags;
        next->flags = tmp;
    }
    line->flags |= SEGFLAG_EXACT_RIGHT;
    next->flags |= SEGFLAG_EXACT_LEFT;
}
+
/**
 * \brief Split a segment at horizontal line y into a bottom part (*line)
 * and a top part (*next, shifted to a zero-based y origin).
 * Mirror of segment_split_horz for the vertical axis.
 */
static void segment_split_vert(struct segment *line, struct segment *next, int32_t y)
{
    assert(y > line->y_min && y < line->y_max);

    *next = *line;
    next->c -= line->b * (int64_t)y;
    next->y_min = 0;
    next->y_max -= y;
    line->y_max = y;

    line->flags &= ~SEGFLAG_EXACT_LEFT;
    next->flags &= ~SEGFLAG_EXACT_RIGHT;
    if (line->flags & SEGFLAG_UR_DL) {
        int32_t tmp = line->flags;
        line->flags = next->flags;
        next->flags = tmp;
    }
    line->flags |= SEGFLAG_EXACT_TOP;
    next->flags |= SEGFLAG_EXACT_BOTTOM;
}
+
/**
 * \brief Nonzero if the segment lies entirely at or left of line x.
 * When the right bound is not exact, the segment's extreme corner is
 * tested against the line equation instead of the loose bounding box.
 */
static inline int segment_check_right(const struct segment *line, int32_t x)
{
    if (line->flags & SEGFLAG_EXACT_RIGHT)
        return line->x_max <= x;
    int64_t cc = line->c - line->a * (int64_t)x -
        line->b * (int64_t)(line->flags & SEGFLAG_UR_DL ? line->y_max : line->y_min);
    if (line->a > 0)
        cc = -cc;
    return cc >= 0;
}

/**
 * \brief Nonzero if the segment lies entirely at or right of line x.
 */
static inline int segment_check_left(const struct segment *line, int32_t x)
{
    if (line->flags & SEGFLAG_EXACT_LEFT)
        return line->x_min >= x;
    int64_t cc = line->c - line->a * (int64_t)x -
        line->b * (int64_t)(line->flags & SEGFLAG_UR_DL ? line->y_min : line->y_max);
    if (line->a < 0)
        cc = -cc;
    return cc >= 0;
}

/**
 * \brief Nonzero if the segment lies entirely at or below line y.
 */
static inline int segment_check_top(const struct segment *line, int32_t y)
{
    if (line->flags & SEGFLAG_EXACT_TOP)
        return line->y_max <= y;
    int64_t cc = line->c - line->b * (int64_t)y -
        line->a * (int64_t)(line->flags & SEGFLAG_UR_DL ? line->x_max : line->x_min);
    if (line->b > 0)
        cc = -cc;
    return cc >= 0;
}

/**
 * \brief Nonzero if the segment lies entirely at or above line y.
 */
static inline int segment_check_bottom(const struct segment *line, int32_t y)
{
    if (line->flags & SEGFLAG_EXACT_BOTTOM)
        return line->y_min >= y;
    int64_t cc = line->c - line->b * (int64_t)y -
        line->a * (int64_t)(line->flags & SEGFLAG_UR_DL ? line->x_min : line->x_max);
    if (line->b < 0)
        cc = -cc;
    return cc >= 0;
}
+
/**
 * \brief Split list of segments horizontally
 * \param src in: input array, can coincide with *dst0 or *dst1
 * \param n_src in: input array size
 * \param dst0, dst1 out: pointers to output arrays of at least n_src size
 * \param x in: split coordinate
 * \return winding difference between bottom-split and bottom-left points
 */
static int polyline_split_horz(const struct segment *src, size_t n_src,
                               struct segment **dst0, struct segment **dst1, int32_t x)
{
    int winding = 0;
    const struct segment *end = src + n_src;
    for (; src != end; ++src) {
        // Segments that touch the bottom boundary exactly contribute
        // to the winding number along it; sign depends on direction.
        int delta = 0;
        if (!src->y_min && (src->flags & SEGFLAG_EXACT_BOTTOM))
            delta = src->a < 0 ? 1 : -1;
        if (segment_check_right(src, x)) {
            // Entirely on the left side of the cut.
            winding += delta;
            if (src->x_min >= x)
                continue;   // degenerate: zero-width leftover
            **dst0 = *src;
            (*dst0)->x_max = FFMIN((*dst0)->x_max, x);
            ++(*dst0);
            continue;
        }
        if (segment_check_left(src, x)) {
            // Entirely on the right side: shift into the new origin.
            **dst1 = *src;
            segment_move_x(*dst1, x);
            ++(*dst1);
            continue;
        }
        // Crosses the cut: UR_DL segments pass their bottom point to
        // the left half, so count the winding there.
        if (src->flags & SEGFLAG_UR_DL)
            winding += delta;
        **dst0 = *src;
        segment_split_horz(*dst0, *dst1, x);
        ++(*dst0);
        ++(*dst1);
    }
    return winding;
}
+
/**
 * \brief Split list of segments vertically
 * Mirror of polyline_split_horz; returns the winding difference
 * between the left-split and bottom-left points.
 */
static int polyline_split_vert(const struct segment *src, size_t n_src,
                               struct segment **dst0, struct segment **dst1, int32_t y)
{
    int winding = 0;
    const struct segment *end = src + n_src;
    for (; src != end; ++src) {
        // Segments touching the left boundary exactly contribute to
        // the winding number along it.
        int delta = 0;
        if (!src->x_min && (src->flags & SEGFLAG_EXACT_LEFT))
            delta = src->b < 0 ? 1 : -1;
        if (segment_check_top(src, y)) {
            // Entirely below the cut.
            winding += delta;
            if (src->y_min >= y)
                continue;
            **dst0 = *src;
            (*dst0)->y_max = (*dst0)->y_max < y ? (*dst0)->y_max : y;
            ++(*dst0);
            continue;
        }
        if (segment_check_bottom(src, y)) {
            // Entirely above the cut: shift into the new origin.
            **dst1 = *src;
            segment_move_y(*dst1, y);
            ++(*dst1);
            continue;
        }
        // Crosses the cut.
        if (src->flags & SEGFLAG_UR_DL)
            winding += delta;
        **dst0 = *src;
        segment_split_vert(*dst0, *dst1, y);
        ++(*dst0);
        ++(*dst1);
    }
    return winding;
}
+
+
/**
 * \brief Fill a width x height region with full coverage, one tile at
 * a time via the fill_solid callback. Dimensions must be multiples of
 * the tile size.
 */
static inline void rasterizer_fill_solid(ASS_Rasterizer *rst,
                                         uint8_t *buf, int width, int height, ptrdiff_t stride)
{
    assert(!(width & ((1 << rst->tile_order) - 1)));
    assert(!(height & ((1 << rst->tile_order) - 1)));

    int i, j;
    ptrdiff_t step = 1 << rst->tile_order;
    ptrdiff_t tile_stride = stride << rst->tile_order;
    width >>= rst->tile_order;   // now counted in tiles
    height >>= rst->tile_order;
    for (j = 0; j < height; ++j) {
        for (i = 0; i < width; ++i)
            rst->fill_solid(buf + i * step, stride);
        buf += tile_stride;
    }
}
+
/**
 * \brief Fill a region covered by the half-plane a*x + b*y < c,
 * tile by tile. Each tile is classified as intersecting the boundary
 * (antialiased via fill_halfplane), fully inside (fill_solid), or
 * fully outside (left untouched). Dimensions must be tile multiples.
 */
static inline void rasterizer_fill_halfplane(ASS_Rasterizer *rst,
                                             uint8_t *buf, int width, int height, ptrdiff_t stride,
                                             int32_t a, int32_t b, int64_t c, int32_t scale)
{
    assert(!(width & ((1 << rst->tile_order) - 1)));
    assert(!(height & ((1 << rst->tile_order) - 1)));
    if (width == 1 << rst->tile_order && height == 1 << rst->tile_order) {
        rst->fill_halfplane(buf, stride, a, b, c, scale);
        return;
    }

    // size: maximum |a*x + b*y| variation across one tile;
    // offs: value of the plane at the tile center, relative to c.
    uint32_t abs_a = a < 0 ? -a : a;
    uint32_t abs_b = b < 0 ? -b : b;
    int64_t size = (int64_t)(abs_a + abs_b) << (rst->tile_order + 5);
    int64_t offs = ((int64_t)a + b) << (rst->tile_order + 5);

    int i, j;
    ptrdiff_t step = 1 << rst->tile_order;
    ptrdiff_t tile_stride = stride << rst->tile_order;
    width >>= rst->tile_order;
    height >>= rst->tile_order;
    for (j = 0; j < height; ++j) {
        for (i = 0; i < width; ++i) {
            // Translate c into the tile's local coordinates.
            int64_t cc = c - ((a * (int64_t)i + b * (int64_t)j) << (rst->tile_order + 6));
            int64_t offs_c = offs - cc;
            int64_t abs_c = offs_c < 0 ? -offs_c : offs_c;
            if (abs_c < size)   // boundary passes through this tile
                rst->fill_halfplane(buf + i * step, stride, a, b, cc, scale);
            else if (((int32_t)(offs_c >> 32) ^ scale) & (1 << 31))
                rst->fill_solid(buf + i * step, stride);   // fully inside
        }
        buf += tile_stride;
    }
}
+
/**
 * \brief Main quad-tree filling function
 * \param index index (0 or 1) of the input segment buffer (rst->linebuf)
 * \param offs current offset from the beginning of the buffer
 * \param winding bottom-left winding value
 * \return zero on error
 * Rasterizes (possibly recursive) one quad-tree level.
 * Truncates used input buffer.
 */
static int rasterizer_fill_level(ASS_Rasterizer *rst,
                                 uint8_t *buf, int width, int height, ptrdiff_t stride, int index, size_t offs, int winding)
{
    assert(width > 0 && height > 0);
    assert((unsigned)index < 2u && offs <= rst->size[index]);
    assert(!(width & ((1 << rst->tile_order) - 1)));
    assert(!(height & ((1 << rst->tile_order) - 1)));

    size_t n = rst->size[index] - offs;
    struct segment *line = rst->linebuf[index] + offs;
    if (!n) {
        // No segments left: the region is uniform — filled when the
        // winding number is nonzero, empty otherwise.
        if (winding)
            rasterizer_fill_solid(rst, buf, width, height, stride);
        return 1;
    }
    if (n == 1) {
        // Single segment: the region is a half-plane. Encode the
        // outcome in flag: bit 0 = draw half-plane, bit 1 = invert it
        // (or fill solid when bit 0 is clear), depending on winding.
        int flag = 0;
        if (line->c < 0)
            winding++;
        if (winding)
            flag ^= 1;
        if (winding - 1)
            flag ^= 3;
        if (flag & 1)
            rasterizer_fill_halfplane(rst, buf, width, height, stride,
                                      line->a, line->b, line->c,
                                      flag & 2 ? -line->scale : line->scale);
        else if (flag & 2)
            rasterizer_fill_solid(rst, buf, width, height, stride);
        rst->size[index] = offs;   // consume the input segment
        return 1;
    }
    if (width == 1 << rst->tile_order && height == 1 << rst->tile_order) {
        // Down to a single tile: hand off to the generic tile filler.
        rst->fill_generic(buf, stride, line, rst->size[index] - offs, winding);
        rst->size[index] = offs;
        return 1;
    }

    // Split the longer side at a power-of-two boundary and recurse on
    // both halves; dst1 gets the far half in the other line buffer.
    size_t offs1 = rst->size[index ^ 1];
    if (!check_capacity(rst, index ^ 1, n))
        return 0;
    struct segment *dst0 = line;
    struct segment *dst1 = rst->linebuf[index ^ 1] + offs1;

    int winding1 = winding;
    uint8_t *buf1 = buf;
    int width1 = width;
    int height1 = height;
    if (width > height) {
        width = 1 << ilog2(width - 1);
        width1 -= width;
        buf1 += width;
        winding1 += polyline_split_horz(line, n, &dst0, &dst1, (int32_t)width << 6);
    } else {
        height = 1 << ilog2(height - 1);
        height1 -= height;
        buf1 += height * stride;
        winding1 += polyline_split_vert(line, n, &dst0, &dst1, (int32_t)height << 6);
    }
    rst->size[index ^ 0] = dst0 - rst->linebuf[index ^ 0];
    rst->size[index ^ 1] = dst1 - rst->linebuf[index ^ 1];

    if (!rasterizer_fill_level(rst, buf, width, height, stride, index ^ 0, offs, winding))
        return 0;
    assert(rst->size[index ^ 0] == offs);
    if (!rasterizer_fill_level(rst, buf1, width1, height1, stride, index ^ 1, offs1, winding1))
        return 0;
    assert(rst->size[index ^ 1] == offs1);
    return 1;
}
+
/**
 * \brief Rasterize the current polyline into an output buffer.
 * \param buf output buffer of size stride * height
 * \param x0, y0 window origin in full pixels
 * \param width, height window size (multiples of the tile size)
 * \param vert_flip nonzero to render bottom-up
 * \return zero on error
 * Translates all segments into window coordinates, clips them against
 * the four window borders, then recursively fills via the quad tree.
 * Consumes (truncates) the preprocessed polyline.
 */
int rasterizer_fill(ASS_Rasterizer *rst,
                    uint8_t *buf, int x0, int y0, int width, int height, ptrdiff_t stride,
                    int vert_flip)
{
    assert(width > 0 && height > 0);
    assert(!(width & ((1 << rst->tile_order) - 1)));
    assert(!(height & ((1 << rst->tile_order) - 1)));
    x0 <<= 6; y0 <<= 6;   // to 26.6 fixed-point

    if (vert_flip) {
        buf += (height - 1) * stride;
        stride = -stride;
    }

    // Translate every segment (and the cached bounds) by (-x0, -y0).
    size_t n = rst->size[0];
    struct segment *line = rst->linebuf[0];
    struct segment *end = line + n;
    for (; line != end; ++line) {
        line->x_min -= x0;
        line->x_max -= x0;
        line->y_min -= y0;
        line->y_max -= y0;
        line->c -= line->a * (int64_t)x0 + line->b * (int64_t)y0;
    }
    rst->x_min -= x0;
    rst->x_max -= x0;
    rst->y_min -= y0;
    rst->y_max -= y0;

    // Clip to the window: the right/top splits keep only dst0 (the
    // inside part), the left/bottom splits keep only dst1; the final
    // bottom split also yields the initial bottom-left winding value.
    int index = 0;
    int winding = 0;
    if (!check_capacity(rst, 1, rst->size[0]))
        return 0;
    int32_t size_x = (int32_t)width << 6;
    int32_t size_y = (int32_t)height << 6;
    if (rst->x_max >= size_x) {
        struct segment *dst0 = rst->linebuf[index];
        struct segment *dst1 = rst->linebuf[index ^ 1];
        polyline_split_horz(rst->linebuf[index], n, &dst0, &dst1, size_x);
        n = dst0 - rst->linebuf[index];
    }
    if (rst->y_max >= size_y) {
        struct segment *dst0 = rst->linebuf[index];
        struct segment *dst1 = rst->linebuf[index ^ 1];
        polyline_split_vert(rst->linebuf[index], n, &dst0, &dst1, size_y);
        n = dst0 - rst->linebuf[index];
    }
    if (rst->x_min <= 0) {
        struct segment *dst0 = rst->linebuf[index];
        struct segment *dst1 = rst->linebuf[index ^ 1];
        polyline_split_horz(rst->linebuf[index], n, &dst0, &dst1, 0);
        index ^= 1;
        n = dst1 - rst->linebuf[index];
    }
    if (rst->y_min <= 0) {
        struct segment *dst0 = rst->linebuf[index];
        struct segment *dst1 = rst->linebuf[index ^ 1];
        winding = polyline_split_vert(rst->linebuf[index], n, &dst0, &dst1, 0);
        index ^= 1;
        n = dst1 - rst->linebuf[index];
    }
    rst->size[index] = n;
    rst->size[index ^ 1] = 0;
    return rasterizer_fill_level(rst, buf, width, height, stride,
                                 index, 0, winding);
}
--- /dev/null
+/*
+ * Copyright (C) 2014 Vabishchevich Nikolay <vabnick@gmail.com>
+ *
+ * This file is part of libass.
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef LIBASS_RASTERIZER_H
+#define LIBASS_RASTERIZER_H
+
+#include <ft2build.h>
+#include FT_FREETYPE_H
+#include <stddef.h>
+#include <stdint.h>
+
+
// Segment flags (see add_line for how they are derived):
enum {
    SEGFLAG_UP = 1,             // set when the segment does not descend (dy >= 0)
    SEGFLAG_UR_DL = 2,          // direction is up-right or down-left
    SEGFLAG_EXACT_LEFT = 4,     // x_min is an actual segment endpoint
    SEGFLAG_EXACT_RIGHT = 8,    // x_max is an actual segment endpoint
    SEGFLAG_EXACT_BOTTOM = 16,  // y_min is an actual segment endpoint
    SEGFLAG_EXACT_TOP = 32      // y_max is an actual segment endpoint
};

// Polyline segment struct
// Line equation a * x + b * y = c (26.6 coordinates), normalized so that
// max(|a|, |b|) * scale ~= 1 << 61; x/y_min/max is the bounding box.
struct segment {
    int64_t c;
    int32_t a, b, scale, flags;
    int32_t x_min, x_max, y_min, y_max;
};


// Tile-level fill callbacks (C or assembly implementations).
typedef void (*FillSolidTileFunc)(uint8_t *buf, ptrdiff_t stride);
typedef void (*FillHalfplaneTileFunc)(uint8_t *buf, ptrdiff_t stride,
                                      int32_t a, int32_t b, int64_t c, int32_t scale);
typedef void (*FillGenericTileFunc)(uint8_t *buf, ptrdiff_t stride,
                                    const struct segment *line, size_t n_lines,
                                    int winding);

void ass_fill_solid_tile16_c(uint8_t *buf, ptrdiff_t stride);
void ass_fill_solid_tile32_c(uint8_t *buf, ptrdiff_t stride);
void ass_fill_halfplane_tile16_c(uint8_t *buf, ptrdiff_t stride,
                                 int32_t a, int32_t b, int64_t c, int32_t scale);
void ass_fill_halfplane_tile32_c(uint8_t *buf, ptrdiff_t stride,
                                 int32_t a, int32_t b, int64_t c, int32_t scale);
void ass_fill_generic_tile16_c(uint8_t *buf, ptrdiff_t stride,
                               const struct segment *line, size_t n_lines,
                               int winding);
void ass_fill_generic_tile32_c(uint8_t *buf, ptrdiff_t stride,
                               const struct segment *line, size_t n_lines,
                               int winding);

typedef struct ass_rasterizer {
    int outline_error;  // acceptable error (in 1/64 pixel units)

    int tile_order;     // log2(tile_size)
    FillSolidTileFunc fill_solid;
    FillHalfplaneTileFunc fill_halfplane;
    FillGenericTileFunc fill_generic;

    int32_t x_min, x_max, y_min, y_max;  // usable after rasterizer_set_outline

    // internal buffers (double-buffered segment lists used by the
    // quad-tree splitting in rasterizer_fill)
    struct segment *linebuf[2];
    size_t size[2], capacity[2];
} ASS_Rasterizer;
+
+void rasterizer_init(ASS_Rasterizer *rst);
+void rasterizer_done(ASS_Rasterizer *rst);
+/**
+ * \brief Convert FreeType outline to polyline and calculate exact bounds
+ */
+int rasterizer_set_outline(ASS_Rasterizer *rst, const FT_Outline *path);
+/**
+ * \brief Polyline rasterization function
+ * \param x0, y0, width, height in: source window (full pixel units)
+ * \param buf out: aligned output buffer (size = stride * height)
+ * \param stride output buffer stride (aligned)
+ * \param vert_flip vertical flip flag
+ * \return zero on error
+ * Deletes preprocessed polyline after work.
+ */
+int rasterizer_fill(ASS_Rasterizer *rst, uint8_t *buf, int x0, int y0,
+ int width, int height, ptrdiff_t stride, int vert_flip);
+
+
+#endif /* LIBASS_RASTERIZER_H */
--- /dev/null
+/*
+ * Copyright (C) 2014 Vabishchevich Nikolay <vabnick@gmail.com>
+ *
+ * This file is part of libass.
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include "ass_utils.h"
+#include "ass_rasterizer.h"
+#include <assert.h>
+
+
+
/**
 * \brief Fill a 16x16 tile with full coverage (255).
 * \param buf top-left pixel of the tile in the output bitmap
 * \param stride distance in bytes between consecutive rows
 */
void ass_fill_solid_tile16_c(uint8_t *buf, ptrdiff_t stride)
{
    int i, j;
    for (j = 0; j < 16; ++j) {
        // Write 255 directly; the previous int8_t intermediate relied on
        // implementation-defined narrowing of 255 to -1.
        for (i = 0; i < 16; ++i)
            buf[i] = 255;
        buf += stride;
    }
}
+
/**
 * \brief Fill a 32x32 tile with full coverage (255).
 * \param buf top-left pixel of the tile in the output bitmap
 * \param stride distance in bytes between consecutive rows
 */
void ass_fill_solid_tile32_c(uint8_t *buf, ptrdiff_t stride)
{
    int i, j;
    for (j = 0; j < 32; ++j) {
        // Write 255 directly; the previous int8_t intermediate relied on
        // implementation-defined narrowing of 255 to -1.
        for (i = 0; i < 32; ++i)
            buf[i] = 255;
        buf += stride;
    }
}
+
+
+/*
+ * Halfplane Filling Functions
+ *
+ * Fill pixels with antialiasing corresponding to equation
+ * A * x + B * y < C, where
+ * x, y - offset of pixel center from bottom-left,
+ * A = a * scale, B = b * scale, C = c * scale / 64.
+ *
+ * Normalization of coefficients prior call:
+ * max(abs(a), abs(b)) * scale = 1 << 61
+ *
+ * Used Algorithm
+ * Let
+ * max_ab = max(abs(A), abs(B)),
+ * min_ab = min(abs(A), abs(B)),
+ * CC = C - A * x - B * y, then
+ * result = (clamp((CC - min_ab / 4) / max_ab) +
+ * clamp((CC + min_ab / 4) / max_ab) +
+ * 1) / 2,
+ * where clamp(Z) = max(-0.5, min(0.5, Z)).
+ */
+
/**
 * \brief Antialiased fill of a 16x16 tile with the half-plane
 * a*x + b*y < c (see the algorithm description above).
 */
void ass_fill_halfplane_tile16_c(uint8_t *buf, ptrdiff_t stride,
                                 int32_t a, int32_t b, int64_t c, int32_t scale)
{
    // Rescale the (1 << 61)-normalized coefficients down to small
    // fixed-point values that fit 16-bit per-pixel arithmetic; the
    // added constants implement round-to-nearest.
    int16_t aa = (a * (int64_t)scale + ((int64_t)1 << 49)) >> 50;
    int16_t bb = (b * (int64_t)scale + ((int64_t)1 << 49)) >> 50;
    int16_t cc = ((int32_t)(c >> 11) * (int64_t)scale + ((int64_t)1 << 44)) >> 45;
    cc += (1 << 9) - ((aa + bb) >> 1);   // shift to pixel centers

    // delta ~= min(|A|, |B|) / 4: half-distance between the two
    // clamped evaluations used for antialiasing.
    int16_t abs_a = aa < 0 ? -aa : aa;
    int16_t abs_b = bb < 0 ? -bb : bb;
    int16_t delta = (FFMIN(abs_a, abs_b) + 2) >> 2;

    // Precompute per-column terms a*i -/+ delta.
    int i, j;
    int16_t va1[16], va2[16];
    for (i = 0; i < 16; ++i) {
        va1[i] = aa * i - delta;
        va2[i] = aa * i + delta;
    }

    static const int16_t full = (1 << 10) - 1;
    for (j = 0; j < 16; ++j) {
        for (i = 0; i < 16; ++i) {
            // Average two clamped evaluations, then scale to 0..255.
            int16_t c1 = cc - va1[i];
            int16_t c2 = cc - va2[i];
            c1 = FFMINMAX(c1, 0, full);
            c2 = FFMINMAX(c2, 0, full);
            buf[i] = (c1 + c2) >> 3;
        }
        buf += stride;
        cc -= bb;   // advance one row
    }
}
+
+void ass_fill_halfplane_tile32_c(uint8_t *buf, ptrdiff_t stride,
+                                 int32_t a, int32_t b, int64_t c, int32_t scale)
+{
+    // Same algorithm as the 16x16 variant, with shift amounts adjusted
+    // for the doubled tile size (value range [0, 1 << 9) per sample).
+    int16_t aa = (a * (int64_t)scale + ((int64_t)1 << 50)) >> 51;
+    int16_t bb = (b * (int64_t)scale + ((int64_t)1 << 50)) >> 51;
+    int16_t cc = ((int32_t)(c >> 12) * (int64_t)scale + ((int64_t)1 << 44)) >> 45;
+    cc += (1 << 8) - ((aa + bb) >> 1);
+
+    int16_t abs_a = aa < 0 ? -aa : aa;
+    int16_t abs_b = bb < 0 ? -bb : bb;
+    int16_t delta = (FFMIN(abs_a, abs_b) + 2) >> 2;
+
+    int i, j;
+    // Precompute per-column terms aa * i -+ delta for both samples.
+    int16_t va1[32], va2[32];
+    for (i = 0; i < 32; ++i) {
+        va1[i] = aa * i - delta;
+        va2[i] = aa * i + delta;
+    }
+
+    static const int16_t full = (1 << 9) - 1;
+    for (j = 0; j < 32; ++j) {
+        for (i = 0; i < 32; ++i) {
+            // Average the two clamped samples; >> 2 rescales the
+            // 2 * (1 << 9) accumulator to the 0..255 output range.
+            int16_t c1 = cc - va1[i];
+            int16_t c2 = cc - va2[i];
+            c1 = FFMINMAX(c1, 0, full);
+            c2 = FFMINMAX(c2, 0, full);
+            buf[i] = (c1 + c2) >> 2;
+        }
+        buf += stride;
+        cc -= bb;  // advance the constant term by one row
+    }
+}
+
+
+/*
+ * Generic Filling Functions
+ *
+ * Used Algorithm
+ * Construct trapezium from each polyline segment and its projection into left side of tile.
+ * Render that trapezium into internal buffer with additive blending and correct sign.
+ * Store clamped absolute value from internal buffer into result buffer.
+ */
+
+// Render one partially covered (top/bottom) scanline of the trapezium with
+// antialiasing. dn/up are sub-pixel bounds (0..64) within this scanline;
+// va[] holds the precomputed per-column terms a * i.
+static inline void update_border_line16(int16_t res[16],
+                                        int16_t abs_a, const int16_t va[16],
+                                        int16_t b, int16_t abs_b,
+                                        int16_t c, int dn, int up)
+{
+    int16_t size = up - dn;
+    // w: weight proportional to the covered fraction of the scanline,
+    // saturated at full coverage (1 << 10) before the << 3 rescale.
+    int16_t w = (1 << 10) + (size << 4) - abs_a;
+    w = FFMIN(w, 1 << 10) << 3;
+
+    int16_t dc_b = abs_b * (int32_t)size >> 6;
+    int16_t dc = (FFMIN(abs_a, dc_b) + 2) >> 2;
+
+    // base: b evaluated at the vertical center of the covered strip.
+    int16_t base = (int32_t)b * (int16_t)(dn + up) >> 7;
+    int16_t offs1 = size - ((base + dc) * (int32_t)w >> 16);
+    int16_t offs2 = size - ((base - dc) * (int32_t)w >> 16);
+
+    int i;
+    size <<= 1;  // clamp ceiling for the sum of both samples
+    for (i = 0; i < 16; ++i) {
+        // Two shifted, clamped samples accumulated into the signed buffer.
+        int16_t cw = (c - va[i]) * (int32_t)w >> 16;
+        int16_t c1 = cw + offs1;
+        int16_t c2 = cw + offs2;
+        c1 = FFMINMAX(c1, 0, size);
+        c2 = FFMINMAX(c2, 0, size);
+        res[i] += c1 + c2;
+    }
+}
+
+void ass_fill_generic_tile16_c(uint8_t *buf, ptrdiff_t stride,
+                               const struct segment *line, size_t n_lines,
+                               int winding)
+{
+    // Accumulate signed coverage from each segment into res[][] and
+    // per-scanline winding corrections into delta[], then emit the
+    // clamped absolute value (see "Generic Filling Functions" above).
+    int i, j;
+    int16_t res[16][16], delta[18];
+    for (j = 0; j < 16; ++j)
+        for (i = 0; i < 16; ++i)
+            res[j][i] = 0;
+    for (j = 0; j < 18; ++j)
+        delta[j] = 0;
+
+    static const int16_t full = 1 << 10;
+    const struct segment *end = line + n_lines;
+    for (; line != end; ++line) {
+        // Segment coordinates are 26.6 fixed point within the tile.
+        assert(line->y_min >= 0 && line->y_min < 1 << 10);
+        assert(line->y_max > 0 && line->y_max <= 1 << 10);
+        assert(line->y_min <= line->y_max);
+
+        // Winding contribution of this segment, derived from its
+        // direction flags; swapped for up-right/down-left segments.
+        int16_t dn_delta = line->flags & SEGFLAG_UP ? 4 : 0;
+        int16_t up_delta = dn_delta;
+        if (!line->x_min && (line->flags & SEGFLAG_EXACT_LEFT))up_delta ^= 4;
+        if (line->flags & SEGFLAG_UR_DL) {
+            int16_t tmp = dn_delta;
+            dn_delta = up_delta;
+            up_delta = tmp;
+        }
+
+        // Split the y-range into whole scanlines [dn, up) plus the
+        // sub-pixel remainders dn_pos/up_pos (units of 1/64 pixel).
+        int dn = line->y_min >> 6, up = line->y_max >> 6;
+        int16_t dn_pos = line->y_min & 63;
+        int16_t dn_delta1 = dn_delta * dn_pos;
+        int16_t up_pos = line->y_max & 63;
+        int16_t up_delta1 = up_delta * up_pos;
+        delta[dn + 1] -= dn_delta1;
+        delta[dn] -= (dn_delta << 6) - dn_delta1;
+        delta[up + 1] += up_delta1;
+        delta[up] += (up_delta << 6) - up_delta1;
+        if (line->y_min == line->y_max)
+            continue;  // horizontal segment: winding delta only
+
+        // Reduce segment coefficients to tile-local 16-bit fixed point,
+        // same scheme as ass_fill_halfplane_tile16_c.
+        int16_t a = (line->a * (int64_t)line->scale + ((int64_t)1 << 49)) >> 50;
+        int16_t b = (line->b * (int64_t)line->scale + ((int64_t)1 << 49)) >> 50;
+        int16_t c = ((int32_t)(line->c >> 11) * (int64_t)line->scale + ((int64_t)1 << 44)) >> 45;
+        c -= (a >> 1) + b * dn;
+
+        int16_t va[16];
+        for (i = 0; i < 16; ++i)
+            va[i] = a * i;
+        int16_t abs_a = a < 0 ? -a : a;
+        int16_t abs_b = b < 0 ? -b : b;
+        int16_t dc = (FFMIN(abs_a, abs_b) + 2) >> 2;
+        int16_t base = (1 << 9) - (b >> 1);
+        int16_t dc1 = base + dc;
+        int16_t dc2 = base - dc;
+
+        // Partially covered first scanline (possibly the only one).
+        if (dn_pos) {
+            if (up == dn) {
+                update_border_line16(res[dn], abs_a, va, b, abs_b, c, dn_pos, up_pos);
+                continue;
+            }
+            update_border_line16(res[dn], abs_a, va, b, abs_b, c, dn_pos, 64);
+            dn++;
+            c -= b;
+        }
+        // Fully covered interior scanlines: halfplane-style fill.
+        for (j = dn; j < up; ++j) {
+            for (i = 0; i < 16; ++i) {
+                int16_t c1 = c - va[i] + dc1;
+                int16_t c2 = c - va[i] + dc2;
+                c1 = FFMINMAX(c1, 0, full);
+                c2 = FFMINMAX(c2, 0, full);
+                res[j][i] += (c1 + c2) >> 3;
+            }
+            c -= b;
+        }
+        // Partially covered last scanline.
+        if (up_pos)
+            update_border_line16(res[up], abs_a, va, b, abs_b, c, 0, up_pos);
+    }
+
+    // Integrate the winding deltas down the tile and store the clamped
+    // absolute coverage (winding enters scaled by 256 per turn).
+    int16_t cur = winding << 8;
+    for (j = 0; j < 16; ++j) {
+        cur += delta[j];
+        for (i = 0; i < 16; ++i) {
+            int16_t val = res[j][i] + cur, neg_val = -val;
+            val = (val > neg_val ? val : neg_val);
+            buf[i] = FFMIN(val, 255);
+        }
+        buf += stride;
+    }
+}
+
+// Render one partially covered (top/bottom) scanline of the trapezium with
+// antialiasing; 32-pixel counterpart of update_border_line16 with shift
+// amounts adjusted for the smaller per-sample range (1 << 9).
+static inline void update_border_line32(int16_t res[32],
+                                        int16_t abs_a, const int16_t va[32],
+                                        int16_t b, int16_t abs_b,
+                                        int16_t c, int dn, int up)
+{
+    int16_t size = up - dn;
+    // w: weight proportional to the covered fraction of the scanline,
+    // saturated at full coverage (1 << 9) before the << 5 rescale.
+    int16_t w = (1 << 9) + (size << 3) - abs_a;
+    w = FFMIN(w, 1 << 9) << 5;
+
+    int16_t dc_b = abs_b * (int32_t)size >> 6;
+    int16_t dc = (FFMIN(abs_a, dc_b) + 2) >> 2;
+
+    // base: b evaluated at the vertical center of the covered strip.
+    int16_t base = (int32_t)b * (int16_t)(dn + up) >> 7;
+    int16_t offs1 = size - ((base + dc) * (int32_t)w >> 16);
+    int16_t offs2 = size - ((base - dc) * (int32_t)w >> 16);
+
+    int i;
+    size <<= 1;  // clamp ceiling for the sum of both samples
+    for (i = 0; i < 32; ++i) {
+        // Two shifted, clamped samples accumulated into the signed buffer.
+        int16_t cw = (c - va[i]) * (int32_t)w >> 16;
+        int16_t c1 = cw + offs1;
+        int16_t c2 = cw + offs2;
+        c1 = FFMINMAX(c1, 0, size);
+        c2 = FFMINMAX(c2, 0, size);
+        res[i] += c1 + c2;
+    }
+}
+
+void ass_fill_generic_tile32_c(uint8_t *buf, ptrdiff_t stride,
+                               const struct segment *line, size_t n_lines,
+                               int winding)
+{
+    // 32x32 counterpart of ass_fill_generic_tile16_c; same structure,
+    // with shift amounts adjusted for the doubled tile size.
+    int i, j;
+    int16_t res[32][32], delta[34];
+    for (j = 0; j < 32; ++j)
+        for (i = 0; i < 32; ++i)
+            res[j][i] = 0;
+    for (j = 0; j < 34; ++j)
+        delta[j] = 0;
+
+    static const int16_t full = 1 << 9;
+    const struct segment *end = line + n_lines;
+    for (; line != end; ++line) {
+        // Segment coordinates are 26.6 fixed point within the tile.
+        assert(line->y_min >= 0 && line->y_min < 1 << 11);
+        assert(line->y_max > 0 && line->y_max <= 1 << 11);
+        assert(line->y_min <= line->y_max);
+
+        // Winding contribution of this segment, derived from its
+        // direction flags; swapped for up-right/down-left segments.
+        int16_t dn_delta = line->flags & SEGFLAG_UP ? 4 : 0;
+        int16_t up_delta = dn_delta;
+        if (!line->x_min && (line->flags & SEGFLAG_EXACT_LEFT))up_delta ^= 4;
+        if (line->flags & SEGFLAG_UR_DL) {
+            int16_t tmp = dn_delta;
+            dn_delta = up_delta;
+            up_delta = tmp;
+        }
+
+        // Split the y-range into whole scanlines [dn, up) plus the
+        // sub-pixel remainders dn_pos/up_pos (units of 1/64 pixel).
+        int dn = line->y_min >> 6, up = line->y_max >> 6;
+        int16_t dn_pos = line->y_min & 63;
+        int16_t dn_delta1 = dn_delta * dn_pos;
+        int16_t up_pos = line->y_max & 63;
+        int16_t up_delta1 = up_delta * up_pos;
+        delta[dn + 1] -= dn_delta1;
+        delta[dn] -= (dn_delta << 6) - dn_delta1;
+        delta[up + 1] += up_delta1;
+        delta[up] += (up_delta << 6) - up_delta1;
+        if (line->y_min == line->y_max)
+            continue;  // horizontal segment: winding delta only
+
+        // Reduce segment coefficients to tile-local 16-bit fixed point,
+        // same scheme as ass_fill_halfplane_tile32_c.
+        int16_t a = (line->a * (int64_t)line->scale + ((int64_t)1 << 50)) >> 51;
+        int16_t b = (line->b * (int64_t)line->scale + ((int64_t)1 << 50)) >> 51;
+        int16_t c = ((int32_t)(line->c >> 12) * (int64_t)line->scale + ((int64_t)1 << 44)) >> 45;
+        c -= (a >> 1) + b * dn;
+
+        int16_t va[32];
+        for (i = 0; i < 32; ++i)
+            va[i] = a * i;
+        int16_t abs_a = a < 0 ? -a : a;
+        int16_t abs_b = b < 0 ? -b : b;
+        int16_t dc = (FFMIN(abs_a, abs_b) + 2) >> 2;
+        int16_t base = (1 << 8) - (b >> 1);
+        int16_t dc1 = base + dc;
+        int16_t dc2 = base - dc;
+
+        // Partially covered first scanline (possibly the only one).
+        if (dn_pos) {
+            if (up == dn) {
+                update_border_line32(res[dn], abs_a, va, b, abs_b, c, dn_pos, up_pos);
+                continue;
+            }
+            update_border_line32(res[dn], abs_a, va, b, abs_b, c, dn_pos, 64);
+            dn++;
+            c -= b;
+        }
+        // Fully covered interior scanlines: halfplane-style fill.
+        for (j = dn; j < up; ++j) {
+            for (i = 0; i < 32; ++i) {
+                int16_t c1 = c - va[i] + dc1;
+                int16_t c2 = c - va[i] + dc2;
+                c1 = FFMINMAX(c1, 0, full);
+                c2 = FFMINMAX(c2, 0, full);
+                res[j][i] += (c1 + c2) >> 2;
+            }
+            c -= b;
+        }
+        // Partially covered last scanline.
+        if (up_pos)
+            update_border_line32(res[up], abs_a, va, b, abs_b, c, 0, up_pos);
+    }
+
+    // Integrate the winding deltas down the tile and store the clamped
+    // absolute coverage (winding enters scaled by 256 per turn).
+    int16_t cur = winding << 8;
+    for (j = 0; j < 32; ++j) {
+        cur += delta[j];
+        for (i = 0; i < 32; ++i) {
+            int16_t val = res[j][i] + cur, neg_val = -val;
+            val = (val > neg_val ? val : neg_val);
+            buf[i] = FFMIN(val, 255);
+        }
+        buf += stride;
+    }
+}
#include "x86/blend_bitmaps.h"
#include "x86/be_blur.h"
+#include "x86/rasterizer.h"
#endif // ASM
#endif
priv->restride_bitmap_func = restride_bitmap_c;
+#if CONFIG_RASTERIZER
+#if CONFIG_LARGE_TILES
+ priv->rasterizer.tile_order = 5;
+ #if (defined(__i386__) || defined(__x86_64__)) && CONFIG_ASM
+ priv->rasterizer.fill_solid = avx2 ? ass_fill_solid_tile32_avx2 :
+ (sse2 ? ass_fill_solid_tile32_sse2 : ass_fill_solid_tile32_c);
+ priv->rasterizer.fill_halfplane = avx2 ? ass_fill_halfplane_tile32_avx2 :
+ (sse2 ? ass_fill_halfplane_tile32_sse2 : ass_fill_halfplane_tile32_c);
+ priv->rasterizer.fill_generic = avx2 ? ass_fill_generic_tile32_avx2 :
+ (sse2 ? ass_fill_generic_tile32_sse2 : ass_fill_generic_tile32_c);
+ #else
+ priv->rasterizer.fill_solid = ass_fill_solid_tile32_c;
+ priv->rasterizer.fill_halfplane = ass_fill_halfplane_tile32_c;
+ priv->rasterizer.fill_generic = ass_fill_generic_tile32_c;
+ #endif
+#else
+ priv->rasterizer.tile_order = 4;
+ #if (defined(__i386__) || defined(__x86_64__)) && CONFIG_ASM
+ priv->rasterizer.fill_solid = avx2 ? ass_fill_solid_tile16_avx2 :
+ (sse2 ? ass_fill_solid_tile16_sse2 : ass_fill_solid_tile16_c);
+ priv->rasterizer.fill_halfplane = avx2 ? ass_fill_halfplane_tile16_avx2 :
+ (sse2 ? ass_fill_halfplane_tile16_sse2 : ass_fill_halfplane_tile16_c);
+ priv->rasterizer.fill_generic = avx2 ? ass_fill_generic_tile16_avx2 :
+ (sse2 ? ass_fill_generic_tile16_sse2 : ass_fill_generic_tile16_c);
+ #else
+ priv->rasterizer.fill_solid = ass_fill_solid_tile16_c;
+ priv->rasterizer.fill_halfplane = ass_fill_halfplane_tile16_c;
+ priv->rasterizer.fill_generic = ass_fill_generic_tile16_c;
+ #endif
+#endif
+ priv->rasterizer.outline_error = 16;
+ rasterizer_init(&priv->rasterizer);
+#endif
+
priv->cache.font_cache = ass_font_cache_create();
priv->cache.bitmap_cache = ass_bitmap_cache_create();
priv->cache.composite_cache = ass_composite_cache_create();
ass_free_images(render_priv->images_root);
ass_free_images(render_priv->prev_images_root);
+#if CONFIG_RASTERIZER
+ rasterizer_done(&render_priv->rasterizer);
+#endif
+
if (render_priv->state.stroker) {
FT_Stroker_Done(render_priv->state.stroker);
render_priv->state.stroker = 0;
FT_Outline_Translate(outline, trans.x, trans.y);
}
- clip_bm = outline_to_bitmap(render_priv->library,
- render_priv->ftlibrary, outline, 0);
+ clip_bm = outline_to_bitmap(render_priv, outline, 0);
// Add to cache
memset(&v, 0, sizeof(v));
}
// render glyph
- error = outline_to_bitmap3(render_priv->library,
- render_priv->synth_priv,
- render_priv->ftlibrary,
+ error = outline_to_bitmap3(render_priv,
outline, border,
&hash_val.bm, &hash_val.bm_o,
&hash_val.bm_s, info->be,
#include "ass_library.h"
#include "ass_drawing.h"
#include "ass_bitmap.h"
+#include "ass_rasterizer.h"
#define GLYPH_CACHE_MAX 10000
#define BITMAP_CACHE_MAX_SIZE 500 * 1048576
TextInfo text_info;
CacheStore cache;
+#if CONFIG_RASTERIZER
+ ASS_Rasterizer rasterizer;
+#endif
BitmapBlendFunc add_bitmaps_func;
BitmapBlendFunc sub_bitmaps_func;
BitmapMulFunc mul_bitmaps_func;
--- /dev/null
+;******************************************************************************
+;* rasterizer.asm: SSE2 tile rasterization functions
+;******************************************************************************
+;* Copyright (C) 2014 Vabishchevich Nikolay <vabnick@gmail.com>
+;*
+;* This file is part of libass.
+;*
+;* Permission to use, copy, modify, and distribute this software for any
+;* purpose with or without fee is hereby granted, provided that the above
+;* copyright notice and this permission notice appear in all copies.
+;*
+;* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+;* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+;* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+;* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+;* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+;* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+;******************************************************************************
+
+%include "x86inc.asm"
+
+%if ARCH_X86_64
+DEFAULT REL
+%endif
+
+SECTION_RODATA 32
+
+; lane indices 0..15, used to compute the per-column term a * i
+words_index: dw 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F
+; "full" clamp value per tile size (1 << 10 for 16x16, 1 << 9 for 32x32)
+words_tile16: dw 1024,1024,1024,1024,1024,1024,1024,1024,1024,1024,1024,1024,1024,1024,1024,1024
+words_tile32: dw 512,512,512,512,512,512,512,512,512,512,512,512,512,512,512,512
+
+SECTION .text
+
+;------------------------------------------------------------------------------
+; MUL reg, num
+; Multiply by constant
+;------------------------------------------------------------------------------
+
+; Strength-reduce multiplication by a small compile-time constant to
+; xor/add/lea/shl where possible; falls back to imul otherwise.
+%macro MUL 2
+%if (%2) == 0
+    xor %1, %1
+%elif (%2) == 1
+%elif (%2) == 2
+    add %1, %1  ; lea %1, [%1 + %1]
+%elif (%2) == 3
+    lea %1, [%1 + 2 * %1]
+%elif (%2) == 4
+    lea %1, [4 * %1]  ; shl %1, 2
+%elif (%2) == 5
+    lea %1, [%1 + 4 * %1]
+%elif (%2) == 8
+    lea %1, [8 * %1]  ; shl %1, 3
+%elif (%2) == 9
+    lea %1, [%1 + 8 * %1]
+%elif (%2) == 16
+    shl %1, 4
+%elif (%2) == 32
+    shl %1, 5
+%elif (%2) == 64
+    shl %1, 6
+%elif (%2) == 128
+    shl %1, 7
+%elif (%2) == 256
+    shl %1, 8
+%else
+    imul %1, %2
+%endif
+%endmacro
+
+;------------------------------------------------------------------------------
+; BCASTW m_dst, r_src
+;------------------------------------------------------------------------------
+
+; Broadcast the low 16 bits of a GPR into every word lane of m_dst.
+%macro BCASTW 2
+    movd xm%1, %2
+%if mmsize == 32
+    vpbroadcastw m%1, xm%1
+%elif mmsize == 16
+    punpcklwd m%1, m%1
+    pshufd m%1, m%1, q0000
+%endif
+%endmacro
+
+;------------------------------------------------------------------------------
+; PABSW m_reg, m_tmp
+;------------------------------------------------------------------------------
+
+; Per-word absolute value; uses pabsw when SSSE3 is available, otherwise
+; emulates it via max(x, -x) and clobbers m_tmp.
+%macro PABSW 2
+%if cpuflag(ssse3)
+    pabsw m%1, m%1
+%else
+    pxor m%2, m%2
+    psubw m%2, m%1
+    pmaxsw m%1, m%2
+%endif
+%endmacro
+
+;------------------------------------------------------------------------------
+; FILL_LINE r_dst, src, size
+;------------------------------------------------------------------------------
+
+; Store `size` bytes of register m_src at r_dst in full-vector chunks;
+; a 16-byte line with 32-byte vectors stores only the xmm half.
+%macro FILL_LINE 3
+%if ((%3) & (mmsize - 1)) == 0
+    %assign %%i 0
+    %rep (%3) / mmsize
+        mova [%1 + %%i], m%2
+        %assign %%i %%i + mmsize
+    %endrep
+%elif (%3) == 16
+    mova [%1], xm%2
+%else
+    %error "invalid line size"
+%endif
+%endmacro
+
+;------------------------------------------------------------------------------
+; FILL_SOLID_TILE tile_order, suffix
+; void fill_solid_tile%2(uint8_t *buf, ptrdiff_t stride);
+;------------------------------------------------------------------------------
+
+; Fill the whole tile with 0xFF bytes (pcmpeqd sets all bits of m0).
+%macro FILL_SOLID_TILE 2
+cglobal fill_solid_tile%2, 2,2,1
+    pcmpeqd m0, m0
+%rep (1 << %1) - 1
+    FILL_LINE r0, 0, 1 << %1
+    add r0, r1
+%endrep
+    FILL_LINE r0, 0, 1 << %1  ; last row: no stride advance needed
+    RET
+%endmacro
+
+INIT_XMM sse2
+FILL_SOLID_TILE 4,16
+FILL_SOLID_TILE 5,32
+INIT_YMM avx2
+FILL_SOLID_TILE 4,16
+FILL_SOLID_TILE 5,32
+
+;------------------------------------------------------------------------------
+; CALC_LINE tile_order, m_dst, m_src, m_delta, m_zero, m_full, m_tmp
+; Calculate line using antialiased halfplane algorithm
+;------------------------------------------------------------------------------
+
+; m_dst = (clamp(m_src) + clamp(m_src + m_delta)) >> (7 - tile_order),
+; clamping to [m_zero, m_full] — the vector form of the C inner loop.
+%macro CALC_LINE 7
+    paddw m%7, m%3, m%4
+    pmaxsw m%2, m%3, m%5
+    pmaxsw m%7, m%5
+    pminsw m%2, m%6
+    pminsw m%7, m%6
+    paddw m%2, m%7
+    psraw m%2, 7 - %1
+%endmacro
+
+;------------------------------------------------------------------------------
+; DEF_A_SHIFT tile_order
+; If single mm-register is enough to store the whole line
+; then sets a_shift = 0,
+; else sets a_shift = log2(mmsize / sizeof(int16_t)).
+;------------------------------------------------------------------------------
+
+; a_shift = 0 when one vector register covers a whole tile line,
+; otherwise log2(words per vector register).
+%macro DEF_A_SHIFT 1
+%if mmsize >= (2 << %1)
+    %define a_shift 0
+%elif mmsize == 32
+    %define a_shift 4
+%elif mmsize == 16
+    %define a_shift 3
+%else
+    %error "invalid mmsize"
+%endif
+%endmacro
+
+;------------------------------------------------------------------------------
+; FILL_HALFPLANE_TILE tile_order, suffix
+; void fill_halfplane_tile%2(uint8_t *buf, ptrdiff_t stride,
+; int32_t a, int32_t b, int64_t c, int32_t scale);
+;------------------------------------------------------------------------------
+
+; Vector port of ass_fill_halfplane_tile*_c; the coefficient reduction
+; mirrors the C code's shift scheme (see ass_rasterizer_c.c).
+%macro FILL_HALFPLANE_TILE 2
+    DEF_A_SHIFT %1
+%if ARCH_X86_64 && a_shift
+cglobal fill_halfplane_tile%2, 6,7,9
+%else
+cglobal fill_halfplane_tile%2, 6,7,8
+%endif
+%if a_shift == 0
+    SWAP 3, 8
+%endif
+
+%if ARCH_X86_64
+    movsxd r2, r2d  ; a
+    movsxd r3, r3d  ; b
+    sar r4, 7 + %1  ; c >> (tile_order + 7)
+    movsxd r5, r5d  ; scale
+    mov r6, 1 << (45 + %1)
+    imul r2, r5
+    add r2, r6
+    sar r2, 46 + %1  ; aa
+    imul r3, r5
+    add r3, r6
+    sar r3, 46 + %1  ; bb
+    imul r4, r5
+    shr r6, 1 + %1
+    add r4, r6
+    sar r4, 45  ; cc
+%else
+    ; 32-bit path: widening imul r1d leaves the high product half in edx.
+    mov r0d, r4m  ; c_lo
+    mov r2d, r5m  ; c_hi
+    mov r1d, r6m  ; scale
+    mov r5d, 1 << 12
+    shr r0d, 7 + %1
+    shl r2d, 25 - %1
+    or r0d, r2d  ; r0d (eax) = c >> (tile_order + 7)
+    imul r1d  ; r2d (edx) = (c >> ...) * scale >> 32
+    add r2d, r5d
+    sar r2d, 13
+    mov r4d, r2d  ; cc
+    shl r5d, 1 + %1
+    mov r0d, r3m  ; r0d (eax) = b
+    imul r1d  ; r2d (edx) = b * scale >> 32
+    add r2d, r5d
+    sar r2d, 14 + %1
+    mov r3d, r2d  ; bb
+    mov r0d, r2m  ; r0d (eax) = a
+    imul r1d  ; r2d (edx) = a * scale >> 32
+    add r2d, r5d
+    sar r2d, 14 + %1  ; aa
+    mov r0d, r0m
+    mov r1d, r1m
+%endif
+    ; cc += (1 << (13 - tile_order)) - ((aa + bb) >> 1)
+    add r4d, 1 << (13 - %1)
+    mov r6d, r2d
+    add r6d, r3d
+    sar r6d, 1
+    sub r4d, r6d
+
+    BCASTW 1, r4d  ; cc
+    BCASTW 2, r2d  ; aa
+%if a_shift
+    psllw m3, m2, a_shift  ; aa * (mmsize / 2)
+%endif
+    pmullw m2, [words_index]
+    psubw m1, m2  ; cc - aa * i
+
+    ; delta = (min(abs(aa), abs(bb)) + 2) >> 2
+    mov r4d, r2d  ; aa
+    mov r6d, r4d
+    sar r6d, 31
+    xor r4d, r6d
+    sub r4d, r6d  ; abs_a
+    mov r5d, r3d  ; bb
+    mov r6d, r5d
+    sar r6d, 31
+    xor r5d, r6d
+    sub r5d, r6d  ; abs_b
+    cmp r4d, r5d
+    cmovg r4d, r5d
+    add r4d, 2
+    sar r4d, 2  ; delta
+    BCASTW 2, r4d
+    psubw m1, m2  ; c1 = cc - aa * i - delta
+    paddw m2, m2  ; 2 * delta
+
+%if a_shift
+    MUL r2d, (1 << %1) - (mmsize / 2)
+    sub r3d, r2d  ; bb - (tile_size - mmsize / 2) * aa
+%endif
+%if ARCH_X86_64 || a_shift == 0
+    BCASTW 8, r3d
+%endif
+
+    pxor m0, m0
+    mova m4, [words_tile%2]
+    mov r2d, (1 << %1)
+    jmp .loop_entry
+
+; NOTE(review): labels below lack a trailing colon; nasm emits an
+; "orphan label" warning for this style — confirm it is intentional.
+.loop_start
+    add r0, r1
+%if ARCH_X86_64 || a_shift == 0
+    psubw m1, m8
+%else
+    BCASTW 7, r3d
+    psubw m1, m7
+%endif
+.loop_entry
+%assign i 0
+%rep (1 << %1) / mmsize
+%if i
+    psubw m1, m3
+%endif
+    CALC_LINE %1, 5, 1,2, 0,4, 7
+    psubw m1, m3
+    CALC_LINE %1, 6, 1,2, 0,4, 7
+    packuswb m5, m6
+%if mmsize == 32
+    vpermq m5, m5, q3120
+%endif
+    mova [r0 + i], m5
+%assign i i + mmsize
+%endrep
+%if (1 << %1) < mmsize
+    ; tail: only the low half (xm5) is stored, so the stale contents of
+    ; m6 packed into the upper lanes appear harmless — confirm.
+    CALC_LINE %1, 5, 1,2, 0,4, 7
+    packuswb m5, m6
+    vpermq m5, m5, q3120
+    mova [r0 + i], xm5
+%endif
+    sub r2d,1
+    jnz .loop_start
+    RET
+%endmacro
+
+INIT_XMM sse2
+FILL_HALFPLANE_TILE 4,16
+FILL_HALFPLANE_TILE 5,32
+INIT_YMM avx2
+FILL_HALFPLANE_TILE 4,16
+FILL_HALFPLANE_TILE 5,32
+
+;------------------------------------------------------------------------------
+; struct segment {
+; int64_t c;
+; int32_t a, b, scale, flags;
+; int32_t x_min, x_max, y_min, y_max;
+; };
+;------------------------------------------------------------------------------
+
+; Field offsets mirroring the C `struct segment` layout declared above;
+; keep in sync with ass_rasterizer.h.
+struc line
+    .c: resq 1
+    .a: resd 1
+    .b: resd 1
+    .scale: resd 1
+    .flags: resd 1
+    .x_min: resd 1
+    .x_max: resd 1
+    .y_min: resd 1
+    .y_max: resd 1
+endstruc
+
+;------------------------------------------------------------------------------
+; ZEROFILL dst, size, tmp1
+;------------------------------------------------------------------------------
+
+; Zero `size` bytes at dst using mm_zero, in 128-byte unrolled chunks
+; plus a remainder loop; advances dst and clobbers tmp1.
+%macro ZEROFILL 3
+%assign %%n 128 / mmsize
+    mov %3, (%2) / 128
+%%zerofill_loop:
+%assign %%i 0
+%rep %%n
+    mova [%1 + %%i], mm_zero
+%assign %%i %%i + mmsize
+%endrep
+    add %1, 128
+    sub %3, 1
+    jnz %%zerofill_loop
+%assign %%i 0
+%rep ((%2) / mmsize) & (%%n - 1)
+    mova [%1 + %%i], mm_zero
+%assign %%i %%i + mmsize
+%endrep
+%endmacro
+
+;------------------------------------------------------------------------------
+; CALC_DELTA_FLAG res, line, tmp1, tmp2
+; Set bits of result register (res):
+; bit 3 - for nonzero dn_delta,
+; bit 2 - for nonzero up_delta.
+;------------------------------------------------------------------------------
+
+; Branchless computation of the winding-delta flags from line.flags and
+; line.x_min, matching the dn_delta/up_delta logic in the C reference.
+%macro CALC_DELTA_FLAG 4
+    mov %3d, [%2 + line.flags]
+    xor %4d, %4d
+    cmp %4d, [%2 + line.x_min]
+    cmovz %4d, %3d  ; flags only when x_min == 0 (SEGFLAG_EXACT_LEFT case)
+    xor %1d, %1d
+    test %3d, 2  ; SEGFLAG_UR_DL
+    cmovnz %1d, %4d
+    shl %3d, 2
+    xor %1d, %3d
+    and %4d, 4
+    and %1d, 4
+    lea %1d, [%1d + 2 * %1d]
+    xor %1d, %4d
+%endmacro
+
+;------------------------------------------------------------------------------
+; UPDATE_DELTA up/dn, dst, flag, pos, tmp
+; Update delta array
+;------------------------------------------------------------------------------
+
+; Apply one endpoint's winding contribution to two adjacent delta[]
+; entries (whole-scanline part and sub-pixel part), guarded by the
+; corresponding flag bit from CALC_DELTA_FLAG.
+%macro UPDATE_DELTA 5
+%ifidn %1, up
+    %define %%op add
+    %define %%opi sub
+    %assign %%flag 1 << 2
+%elifidn %1, dn
+    %define %%op sub
+    %define %%opi add
+    %assign %%flag 1 << 3
+%else
+    %error "up/dn expected"
+%endif
+
+    test %3d, %%flag
+    jz %%skip
+    lea %5d, [4 * %4d - 256]  ; 4 * pos - (4 << 6)
+    %%opi [%2], %5w
+    lea %5d, [4 * %4d]  ; 4 * pos
+    %%op [%2 + 2], %5w
+%%skip:
+%endmacro
+
+;------------------------------------------------------------------------------
+; CALC_VBA tile_order, b
+; Calculate b - (tile_size - (mmsize / sizeof(int16_t))) * a
+;------------------------------------------------------------------------------
+
+; mm_vba = b - (tile_size - words_per_reg) * a, the per-row vector step.
+%macro CALC_VBA 2
+    BCASTW m_vba, %2d
+%rep (2 << %1) / mmsize - 1
+    psubw mm_vba, mm_van
+%endrep
+%endmacro
+
+;------------------------------------------------------------------------------
+; FILL_BORDER_LINE tile_order, res, abs_a(abs_ab), b, [abs_b], size, sum,
+; tmp8, tmp9, mt10, mt11, mt12, mt13, mt14, [mt15]
+; Render top/bottom line of the trapezium with antialiasing
+;------------------------------------------------------------------------------
+
+; Vector port of the C update_border_line* helper: renders one partially
+; covered scanline into the internal int16 buffer at %2. On x86-32 abs_a
+; and abs_b are packed into one register (%3 = abs_ab) and the weight is
+; re-broadcast inside the loop due to register pressure.
+%macro FILL_BORDER_LINE 15
+    mov %8d, %6d
+    shl %8d, 8 - %1  ; size << (8 - tile_order)
+    xor %9d, %9d
+%if ARCH_X86_64
+    sub %8d, %3d  ; abs_a
+    cmovg %8d, %9d
+    add %8d, 1 << (14 - %1)
+    shl %8d, 2 * %1 - 5  ; w
+    BCASTW %15, %8d
+
+    mov %9d, %5d  ; abs_b
+    imul %9d, %6d
+    sar %9d, 6  ; dc_b
+    cmp %9d, %3d  ; abs_a
+    cmovg %9d, %3d
+%else
+    sub %8w, %3w  ; abs_a
+    cmovg %8d, %9d
+    add %8w, 1 << (14 - %1)
+    shl %8d, 2 * %1 - 5  ; w
+
+    mov %9d, %3d  ; abs_ab
+    shr %9d, 16  ; abs_b
+    imul %9d, %6d
+    sar %9d, 6  ; dc_b
+    cmp %9w, %3w
+    cmovg %9w, %3w
+%endif
+    add %9d, 2
+    sar %9d, 2  ; dc
+
+    imul %7d, %4d  ; sum * b
+    sar %7d, 7  ; avg * b
+    add %7d, %9d  ; avg * b + dc
+    add %9d, %9d  ; 2 * dc
+
+    imul %7d, %8d
+    sar %7d, 16
+    sub %7d, %6d  ; -offs1
+    BCASTW %10, %7d
+    imul %9d, %8d
+    sar %9d, 16  ; offs2 - offs1
+    BCASTW %11, %9d
+    add %6d, %6d
+    BCASTW %12, %6d  ; 2 * size: clamp ceiling
+
+%assign %%i 0
+%rep (2 << %1) / mmsize
+%if %%i
+    psubw mm_c, mm_van
+%endif
+%if ARCH_X86_64
+    pmulhw m%13, mm_c, m%15
+%else
+    BCASTW %14, %8d
+    pmulhw m%13, mm_c, m%14
+%endif
+    psubw m%13, m%10  ; c1
+    paddw m%14, m%13, m%11  ; c2
+    pmaxsw m%13, mm_zero
+    pmaxsw m%14, mm_zero
+    pminsw m%13, m%12
+    pminsw m%14, m%12
+    paddw m%13, m%14
+    paddw m%13, [%2 + %%i]  ; additive blend into the internal buffer
+    mova [%2 + %%i], m%13
+%assign %%i %%i + mmsize
+%endrep
+%endmacro
+
+;------------------------------------------------------------------------------
+; SAVE_RESULT tile_order, buf, stride, src, delta,
+; tmp6, tmp7, mt8, mt9, mt10, mt11
+; Convert and store internal buffer (with delta array) in the result buffer
+;------------------------------------------------------------------------------
+
+; Final pass: integrate delta[] down the tile (running sum in %7),
+; add it to each buffered row, take the absolute value, and pack the
+; saturated bytes into the output buffer.
+%macro SAVE_RESULT 11
+    mov %6d, 1 << %1
+    xor %7d, %7d
+%%save_loop:
+    add %7w, [%5]  ; cur += delta[j]
+    BCASTW %10, %7d
+    add %5, 2
+
+%assign %%i 0
+%rep (1 << %1) / mmsize
+    paddw m%8, m%10, [%4 + 2 * %%i]
+    PABSW %8, %11
+    paddw m%9, m%10, [%4 + 2 * %%i + mmsize]
+    PABSW %9, %11
+    packuswb m%8, m%9
+%if mmsize == 32
+    vpermq m%8, m%8, q3120
+%endif
+    mova [%2 + %%i], m%8
+%assign %%i %%i + mmsize
+%endrep
+%if (1 << %1) < mmsize
+    ; tail: one vector holds the full row; only xm%8 is stored
+    paddw m%8, m%10, [%4 + 2 * %%i]
+    PABSW %8, %11
+    packuswb m%8, m%8
+    vpermq m%8, m%8, q3120
+    mova [%2 + %%i], xm%8
+%endif
+
+    add %2, %3
+    add %4, 2 << %1
+    sub %6d, 1
+    jnz %%save_loop
+%endmacro
+
+;------------------------------------------------------------------------------
+; GET_RES_ADDR dst
+; CALC_RES_ADDR tile_order, dst/index, tmp, [skip_calc]
+; Calculate position of line in the internal buffer
+;------------------------------------------------------------------------------
+
+; dst = start of the (vector-aligned) internal buffer on the stack.
+%macro GET_RES_ADDR 1
+%if mmsize <= 16 && HAVE_ALIGNED_STACK
+    mov %1, rstk
+%else
+    lea %1, [rstk + mmsize - 1]
+    and %1, ~(mmsize - 1)  ; manual alignment when rstk isn't guaranteed
+%endif
+%endmacro
+
+; Turn a scanline index into its address in the internal buffer;
+; `skip_calc` reuses the alignment already computed in tmp.
+%macro CALC_RES_ADDR 3-4 noskip
+    shl %2d, 1 + %1  ; index * 2 * tile_size bytes
+%if mmsize <= 16 && HAVE_ALIGNED_STACK
+    add %2, rstk
+%else
+%ifidn %4, noskip
+    lea %3, [rstk + mmsize - 1]
+    and %3, ~(mmsize - 1)
+%endif
+    add %2, %3
+%endif
+%endmacro
+
+;------------------------------------------------------------------------------
+; FILL_GENERIC_TILE tile_order, suffix
+; void fill_generic_tile%2(uint8_t *buf, ptrdiff_t stride,
+; const struct segment *line, size_t n_lines,
+; int winding);
+;------------------------------------------------------------------------------
+
+%macro FILL_GENERIC_TILE 2
+ ; t3=line t4=dn/cur t5=up/end t6=up_pos t7=dn_pos
+ ; t8=a/abs_a/abs_ab t9=b t10=c/abs_b
+%if ARCH_X86_64
+ DECLARE_REG_TMP 10,11,5,2, 4,9,6,7, 8,12,13
+%else
+ DECLARE_REG_TMP 0,1,5,3, 4,6,6,0, 2,3,5
+%endif
+
+ %assign tile_size 1 << %1
+ %assign delta_offs 2 * tile_size * tile_size
+ %assign alloc_size 2 * tile_size * (tile_size + 1) + 4
+ %assign buf_size 2 * tile_size * (tile_size + 1)
+ DEF_A_SHIFT %1
+
+%if ARCH_X86_64
+ %define m_zero 6
+ %define m_full 7
+ %define mm_index m8
+ %define m_c 9
+ %define m_vba 10
+%if a_shift
+ %define m_van 11
+cglobal fill_generic_tile%2, 5,14,12
+%else
+cglobal fill_generic_tile%2, 5,14,11
+%endif
+
+%else
+ %define m_zero 5
+ %define m_full 4 ; tmp
+ %define mm_index [words_index]
+ %define m_c 7
+%if a_shift
+ %define m_van 6
+ %define m_vba 3 ; tmp
+%else
+ %define m_vba 6
+%endif
+
+ %assign alloc_size alloc_size + 8
+cglobal fill_generic_tile%2, 0,7,8
+%endif
+
+ %define mm_zero m %+ m_zero
+ %define mm_full m %+ m_full
+ %define mm_c m %+ m_c
+ %define mm_vba m %+ m_vba
+%if a_shift
+ %define mm_van m %+ m_van
+%endif
+
+%if mmsize <= 16 && HAVE_ALIGNED_STACK
+ %assign alloc_size alloc_size + stack_offset + gprsize + (mmsize - 1)
+ %assign alloc_size (alloc_size & ~(mmsize - 1)) - stack_offset - gprsize
+%else
+ %assign alloc_size alloc_size + 2 * mmsize
+ %assign delta_offs delta_offs + mmsize
+ %assign buf_size buf_size + mmsize
+%endif
+ SUB rstk, alloc_size
+
+ GET_RES_ADDR t0
+ pxor mm_zero, mm_zero
+ ZEROFILL t0, buf_size, t1
+
+%if ARCH_X86_64 == 0
+ mov r4d, r4m
+%endif
+ shl r4d, 8
+ mov [rstk + delta_offs], r4w
+
+%if ARCH_X86_64
+ mova mm_index, [words_index]
+ mova mm_full, [words_tile%2]
+ %define up_addr t5
+%else
+ %define up_addr [rstk + delta_offs + 2 * tile_size + 4]
+ %define up_pos [rstk + delta_offs + 2 * tile_size + 8]
+%endif
+
+.line_loop
+%if ARCH_X86_64 == 0
+ mov t3, r2m
+ lea t0, [t3 + line_size]
+ mov r2m, t0
+%endif
+ CALC_DELTA_FLAG t0, t3, t1,t2
+
+ mov t4d, [t3 + line.y_min]
+ mov t2d, [t3 + line.y_max]
+%if ARCH_X86_64
+ mov t8d, t4d
+ mov t6d, t4d
+ and t6d, 63 ; dn_pos
+ shr t4d, 6 ; dn
+ mov t5d, t2d
+ mov t7d, t2d
+ and t7d, 63 ; up_pos
+ shr t5d, 6 ; up
+
+ UPDATE_DELTA dn, rstk + 2 * t4 + delta_offs, t0,t6, t1
+ UPDATE_DELTA up, rstk + 2 * t5 + delta_offs, t0,t7, t1
+ cmp t8d, t2d
+%else
+ lea t1d, [t0d + 1]
+ cmp t4d, t2d
+ cmovnz t0d, t1d ; bit 0 -- not horz line
+
+ mov t6d, t2d
+ and t6d, 63 ; up_pos
+ shr t2d, 6 ; up
+ UPDATE_DELTA up, rstk + 2 * t2 + delta_offs, t0,t6, t1
+
+ CALC_RES_ADDR %1, t2, t1
+ mov up_addr, t2
+ mov up_pos, t6d
+
+ mov t6d, t4d
+ and t6d, 63 ; dn_pos
+ shr t4d, 6 ; dn
+ UPDATE_DELTA dn, rstk + 2 * t4 + delta_offs, t0,t6, t1
+ test t0d, 1
+%endif
+ jz .end_line_loop
+
+%if ARCH_X86_64
+ movsxd t8, dword [t3 + line.a]
+ movsxd t9, dword [t3 + line.b]
+ mov t10, [t3 + line.c]
+ sar t10, 7 + %1 ; c >> (tile_order + 7)
+ movsxd t0, dword [t3 + line.scale]
+ mov t1, 1 << (45 + %1)
+ imul t8, t0
+ add t8, t1
+ sar t8, 46 + %1 ; a
+ imul t9, t0
+ add t9, t1
+ sar t9, 46 + %1 ; b
+ imul t10, t0
+ shr t1, 1 + %1
+ add t10, t1
+ sar t10, 45 ; c
+%else
+ mov r0d, [t3 + line.c]
+ mov r2d, [t3 + line.c + 4]
+ mov r1d, [t3 + line.scale]
+ shr r0d, 7 + %1
+ shl r2d, 25 - %1
+ or r0d, r2d ; r0d (eax) = c >> (tile_order + 7)
+ imul r1d ; r2d (edx) = (c >> ...) * scale >> 32
+ add r2d, 1 << 12
+ sar r2d, 13
+ mov t10d, r2d ; c
+ mov r0d, [t3 + line.b] ; r0d (eax)
+ imul r1d ; r2d (edx) = b * scale >> 32
+ add r2d, 1 << (13 + %1)
+ sar r2d, 14 + %1
+ mov r0d, [t3 + line.a] ; r0d (eax)
+ mov t9d, r2d ; b (overrides t3)
+ imul r1d ; r2d (edx) = a * scale >> 32
+ add r2d, 1 << (13 + %1)
+ sar r2d, 14 + %1 ; a (t8d)
+%endif
+
+ mov t0d, t8d ; a
+ sar t0d, 1
+ sub t10d, t0d
+ mov t0d, t9d ; b
+ imul t0d, t4d
+ sub t10d, t0d
+ BCASTW m_c, t10d
+
+ BCASTW 0, t8d
+%if a_shift
+ psllw mm_van, m0, a_shift ; a * (mmsize / 2)
+%endif
+ pmullw m0, mm_index
+ psubw mm_c, m0 ; c - a * i
+
+ mov t0d, t8d ; a
+ sar t0d, 31
+ xor t8d, t0d
+ sub t8d, t0d ; abs_a
+ mov t0d, t9d ; b
+ mov t10d, t9d
+ sar t0d, 31
+ xor t10d, t0d
+ sub t10d, t0d ; abs_b
+%if ARCH_X86_64 == 0
+; x86-32 is short on GPRs: pack abs_b into the high 16 bits of the register
+; holding abs_a so both magnitudes survive until the fill loops below.
+ shl t10d, 16
+ or t8d, t10d ; abs_ab
+%endif
+
+; Compute the result addresses for the first (t4) and, on x86-64, the
+; one-past-last (t5) scanline of this segment's vertical span.
+ CALC_RES_ADDR %1, t4, t0
+%if ARCH_X86_64
+ CALC_RES_ADDR %1, t5, t0, skip
+%endif
+; Segment confined to a single scanline row: take the short path.
+ cmp t4, up_addr
+ jz .single_line
+
+%if ARCH_X86_64 || a_shift == 0
+ CALC_VBA %1, t9
+%endif
+
+; dn_pos == 0 means the segment starts exactly on a row boundary, so no
+; partial bottom border line is needed.
+ test t6d, t6d
+ jz .generic_fist
+ mov t2d, 64
+ sub t2d, t6d ; 64 - dn_pos
+ add t6d, 64 ; 64 + dn_pos
+; Render the fractional bottom row of the segment.
+ FILL_BORDER_LINE %1, t4,t8,t9,t10,t2,t6, t0,t1, 0,1,2,3,4,5
+
+%if ARCH_X86_64 == 0
+; On x86-32 t5/CALC_VBA were deferred until after FILL_BORDER_LINE to keep
+; registers free; materialize them now. NOTE(review): presumably
+; FILL_BORDER_LINE clobbers the registers involved — confirm in its macro.
+ mov t5, up_addr
+%if a_shift
+ CALC_VBA %1, t9
+%endif
+%endif
+
+; Advance the running coverage constant and the output pointer by one row.
+ psubw mm_c, mm_vba
+ add t4, 2 << %1
+ cmp t4, t5
+ jge .end_loop
+%if ARCH_X86_64 == 0
+ jmp .bulk_fill
+%endif
+
+.generic_fist
+%if ARCH_X86_64 == 0
+ mov t5, up_addr
+%if a_shift
+ CALC_VBA %1, t9
+%endif
+%endif
+
+; Fill all fully-covered interior rows between the bottom and top borders.
+.bulk_fill
+ mov t2d, 1 << (13 - %1)
+ mov t0d, t9d ; b
+ sar t0d, 1
+ sub t2d, t0d ; base
+%if ARCH_X86_64
+; dc is derived from min(abs_a, abs_b).
+ mov t0d, t10d ; abs_b
+ cmp t0d, t8d ; abs_a
+ cmovg t0d, t8d
+%else
+; 32-bit variant: unpack abs_b from the packed abs_ab word pair and take the
+; minimum against abs_a using 16-bit ops.
+ mov t0d, t8d ; abs_ab
+ shr t0d, 16 ; abs_b
+ cmp t0w, t8w
+ cmovg t0w, t8w
+%endif
+ add t0d, 2
+ sar t0d, 2 ; dc
+%if ARCH_X86_64
+ sub t2d, t0d ; base - dc
+%else
+ sub t2w, t0w ; base - dc
+%endif
+ add t0d, t0d ; 2 * dc
+ BCASTW 2, t0d
+
+%if ARCH_X86_64
+; Bias the running constant by (base - dc) for the duration of the bulk loop;
+; m3 holds the bias so it can be subtracted back out afterwards.
+ BCASTW 3, t2d
+ paddw mm_c, m3
+%else
+ BCASTW 0, t2d
+ paddw mm_c, m0
+
+; x86-32 has no spare vector register to cache the saturation constant, so
+; reload it from memory here.
+ mova mm_full, [words_tile%2]
+%endif
+; Per-row loop: compute clamped coverage for each vector lane of the tile row
+; and accumulate it into the delta buffer at t4.
+.internal_loop
+%assign i 0
+%rep (2 << %1) / mmsize
+%if i
+ psubw mm_c, mm_van
+%endif
+ CALC_LINE %1, 0, m_c,2, m_zero,m_full, 1
+ paddw m0, [t4 + i]
+ mova [t4 + i], m0
+%assign i i + mmsize
+%endrep
+ psubw mm_c, mm_vba
+ add t4, 2 << %1
+ cmp t4, t5
+ jl .internal_loop
+; Undo the (base - dc) bias applied before the loop.
+%if ARCH_X86_64
+ psubw mm_c, m3
+%else
+ BCASTW 0, t2d
+ psubw mm_c, m0
+%endif
+
+.end_loop
+%if ARCH_X86_64
+; up_pos == 0: segment ends exactly on a row boundary, no top border line.
+ test t7d, t7d
+ jz .end_line_loop
+ xor t6d, t6d
+%else
+ mov t2d, up_pos
+ test t2d, t2d
+ jz .end_line_loop
+ mov t6d, t2d
+ jmp .last_line
+%endif
+
+; Segment begins and ends within one scanline row: single border line with
+; both fractional edges.
+.single_line
+%if ARCH_X86_64 == 0
+ mov t7d, up_pos
+%endif
+ mov t2d, t7d
+ sub t2d, t6d ; up_pos - dn_pos
+ add t6d, t7d ; up_pos + dn_pos
+.last_line
+ FILL_BORDER_LINE %1, t4,t8,t9,t10,t2,t6, t0,t1, 0,1,2,3,4,5
+
+; Advance to the next input segment; the loop counter lives in r3 (64-bit)
+; or in its stack home r3m (32-bit).
+.end_line_loop
+%if ARCH_X86_64
+ add r2, line_size
+ sub r3, 1
+%else
+ sub dword r3m, 1
+%endif
+ jnz .line_loop
+
+; All segments processed: convert the accumulated per-row deltas into the
+; final 8-bit tile and store it to the caller's buffer (r0/r1 = buf/stride).
+%if ARCH_X86_64 == 0
+ mov r0, r0m
+ mov r1, r1m
+%endif
+ GET_RES_ADDR r2
+ lea r3, [rstk + delta_offs]
+ SAVE_RESULT %1, r0,r1,r2,r3, r4,t2, 0,1,2,3
+ ADD rstk, alloc_size
+ RET
+%endmacro
+
+; Instantiate the generic-tile fill for both tile sizes and both SIMD
+; widths. %1 is the log2 tile dimension, %2 the tile dimension itself
+; (4,16 -> 16x16 tile; 5,32 -> 32x32 tile), matching the
+; ass_fill_generic_tile{16,32}_{sse2,avx2} prototypes in x86/rasterizer.h.
+INIT_XMM sse2
+FILL_GENERIC_TILE 4,16
+FILL_GENERIC_TILE 5,32
+INIT_YMM avx2
+FILL_GENERIC_TILE 4,16
+FILL_GENERIC_TILE 5,32
--- /dev/null
+/*
+ * Copyright (C) 2014 Vabishchevich Nikolay <vabnick@gmail.com>
+ *
+ * This file is part of libass.
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef X86_RASTERIZER_H
+#define X86_RASTERIZER_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+
+/* Opaque here; defined by the rasterizer (see ass_rasterizer.h). */
+struct segment;
+
+/*
+ * Assembly tile-fill entry points (implemented in x86/rasterizer.asm).
+ * The tile16/tile32 suffixes select the tile dimension (16x16 vs 32x32
+ * pixels, judging by the macro instantiations in the .asm file); _sse2/_avx2
+ * select the SIMD instruction set. Each routine writes one tile of 8-bit
+ * coverage into buf, with rows `stride` bytes apart.
+ */
+
+/* Fill an entire tile with solid (full) coverage. */
+void ass_fill_solid_tile16_sse2(uint8_t *buf, ptrdiff_t stride);
+void ass_fill_solid_tile32_sse2(uint8_t *buf, ptrdiff_t stride);
+/* Fill a tile clipped by the half-plane a*x + b*y < c, scaled by `scale`. */
+void ass_fill_halfplane_tile16_sse2(uint8_t *buf, ptrdiff_t stride,
+ int32_t a, int32_t b, int64_t c, int32_t scale);
+void ass_fill_halfplane_tile32_sse2(uint8_t *buf, ptrdiff_t stride,
+ int32_t a, int32_t b, int64_t c, int32_t scale);
+/* Fill a tile from n_lines polygon segments with the given fill winding. */
+void ass_fill_generic_tile16_sse2(uint8_t *buf, ptrdiff_t stride,
+ const struct segment *line, size_t n_lines,
+ int winding);
+void ass_fill_generic_tile32_sse2(uint8_t *buf, ptrdiff_t stride,
+ const struct segment *line, size_t n_lines,
+ int winding);
+
+/* AVX2 variants of the routines above; identical contracts. */
+void ass_fill_solid_tile16_avx2(uint8_t *buf, ptrdiff_t stride);
+void ass_fill_solid_tile32_avx2(uint8_t *buf, ptrdiff_t stride);
+void ass_fill_halfplane_tile16_avx2(uint8_t *buf, ptrdiff_t stride,
+ int32_t a, int32_t b, int64_t c, int32_t scale);
+void ass_fill_halfplane_tile32_avx2(uint8_t *buf, ptrdiff_t stride,
+ int32_t a, int32_t b, int64_t c, int32_t scale);
+void ass_fill_generic_tile16_avx2(uint8_t *buf, ptrdiff_t stride,
+ const struct segment *line, size_t n_lines,
+ int winding);
+void ass_fill_generic_tile32_avx2(uint8_t *buf, ptrdiff_t stride,
+ const struct segment *line, size_t n_lines,
+ int winding);
+
+
+#endif /* X86_RASTERIZER_H */
+