initial commit, 4.5 stable

2025-09-16 20:46:46 -04:00
commit 9d30169a8d
13378 changed files with 7050105 additions and 0 deletions
--- a/thirdparty/thorvg/src/renderer/sw_engine/tvgSwCommon.h
+++ b/thirdparty/thorvg/src/renderer/sw_engine/tvgSwCommon.h
@@ -0,0 +1,599 @@
+/*
+ * Copyright (c) 2020 - 2024 the ThorVG project. All rights reserved.
+
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef _TVG_SW_COMMON_H_
+#define _TVG_SW_COMMON_H_
+
+#include "tvgCommon.h"
+#include "tvgMath.h"
+#include "tvgRender.h"
+
+#define SW_CURVE_TYPE_POINT 0                  //curve-type tag: point (presumably a value for SwOutline::types - confirm)
+#define SW_CURVE_TYPE_CUBIC 1                  //curve-type tag: cubic (presumably a value for SwOutline::types - confirm)
+#define SW_ANGLE_PI (180L << 16)               //180 scaled by 2^16
+#define SW_ANGLE_2PI (SW_ANGLE_PI << 1)        //twice SW_ANGLE_PI
+#define SW_ANGLE_PI2 (SW_ANGLE_PI >> 1)        //half of SW_ANGLE_PI
+
+using SwCoord = signed long;                   //coordinate in 1/64 units (TO_FLOAT divides by 64, TO_SWCOORD multiplies by 64)
+using SwFixed = signed long long;              //integer type used by the stroke state and math helpers for angles and lengths
+
+
+static inline float TO_FLOAT(SwCoord val)   //SwCoord (1/64 units) -> float
+{
+    return float(val) / 64.0f;   //64 SwCoord steps equal 1.0f
+}
+
+struct SwPoint   //2D point with SwCoord components (plain aggregate, no constructors)
+{
+    SwCoord x, y;
+
+    SwPoint& operator+=(const SwPoint& rhs)
+    {
+        x = x + rhs.x;
+        y = y + rhs.y;
+        return *this;
+    }
+
+    SwPoint operator+(const SwPoint& rhs) const
+    {
+        return SwPoint{x + rhs.x, y + rhs.y};
+    }
+
+    SwPoint operator-(const SwPoint& rhs) const
+    {
+        return SwPoint{x - rhs.x, y - rhs.y};
+    }
+
+    bool operator==(const SwPoint& rhs) const
+    {
+        return !(x != rhs.x || y != rhs.y);
+    }
+
+    bool operator!=(const SwPoint& rhs) const
+    {
+        return !(x == rhs.x && y == rhs.y);
+    }
+
+    bool zero() const
+    {
+        //true only when both components are exactly 0
+        return (x == 0 && y == 0);
+    }
+
+    bool small() const
+    {
+        //2 is epsilon...
+        //both components must lie strictly inside (-2, 2)
+        return (abs(x) < 2 && abs(y) < 2);
+    }
+
+    Point toPoint() const
+    {
+        return {TO_FLOAT(x), TO_FLOAT(y)};   //float copy of this point
+    }
+};
+
+struct SwSize   //width/height pair expressed as SwCoord values
+{
+    SwCoord w, h;
+};
+
+struct SwOutline   //outline data: points, contour end points, curve types, closed flags and the fill rule
+{
+    Array<SwPoint> pts;             //the outline's points
+    Array<uint32_t> cntrs;          //the contour end points
+    Array<uint8_t> types;           //curve type
+    Array<bool> closed;             //opened or closed path?
+    FillRule fillRule;              //fill rule associated with this outline
+};
+
+struct SwSpan   //presumably one horizontal run of 'len' pixels starting at (x, y) - confirm with the rasterizer
+{
+    uint16_t x, y;
+    uint16_t len;
+    uint8_t coverage;   //8-bit coverage value stored for the run
+};
+
+struct SwRle   //span buffer handed around by the rle*() functions declared in this header
+{
+    SwSpan *spans;
+    uint32_t alloc;   //presumably the allocated capacity - confirm the unit (spans or bytes) in the rle implementation
+    uint32_t size;    //presumably the number of spans in use - confirm
+};
+
+struct SwBBox   //rectangle stored as its min and max corner points
+{
+    SwPoint min, max;
+
+    void reset()
+    {
+        min = max = SwPoint{0, 0};   //collapse both corners onto the origin
+    }
+};
+
+struct SwFill   //prepared gradient state: linear or radial coefficients plus the color lookup table
+{
+    struct SwLinear {
+        float dx, dy;          //gradient direction divided by its squared length, then mapped through the inverse transform (see _prepareLinear)
+        float offset;          //constant term paired with dx/dy
+    };
+
+    struct SwRadial {
+        float a11, a12, a13;   //first row of the inverse transform
+        float a21, a22, a23;   //second row of the inverse transform
+        float fx, fy, fr;      //focal point and focal radius
+        float dx, dy, dr;      //center minus focal point, and radius minus focal radius
+        float invA, a;         //a = dr^2 - dx^2 - dy^2; invA = 1 / a, assigned only when a > 0
+    };
+
+    union {
+        SwLinear linear;
+        SwRadial radial;
+    };
+
+    uint32_t* ctable;          //color table of GRADIENT_STOP_SIZE entries, malloc'ed in _updateColorTable
+    FillSpread spread;
+
+    bool solid = false; //solid color fill with the last color from colorStops
+    bool translucent;          //set to true when any color stop's alpha times opacity is below 255
+};
+
+struct SwStrokeBorder   //growable point/tag list for one stroke border
+{
+    uint32_t ptsCnt;   //presumably the number of points in use - confirm
+    uint32_t maxPts;   //presumably the allocated capacity of pts/tags - confirm
+    SwPoint* pts;
+    uint8_t* tags;     //presumably one tag per entry of pts - confirm
+    int32_t start;     //index of current sub-path start point
+    bool movable;      //true: for ends of lineto borders
+};
+
+struct SwStroke   //stroker state used by the stroke*() functions declared in this header
+{
+    SwFixed angleIn;
+    SwFixed angleOut;
+    SwPoint center;
+    SwFixed lineLength;
+    SwFixed subPathAngle;
+    SwPoint ptStartSubPath;
+    SwFixed subPathLineLength;
+    SwFixed width;
+    SwFixed miterlimit;
+
+    StrokeCap cap;
+    StrokeJoin join;
+    StrokeJoin joinSaved;
+    SwFill* fill = nullptr;       //gradient fill of the stroke, null by default
+
+    SwStrokeBorder borders[2];    //presumably the two sides of the stroke - confirm
+
+    float sx, sy;                 //presumably scale factors derived from the transform - confirm
+
+    bool firstPt;
+    bool closedSubPath;
+    bool handleWideStrokes;
+};
+
+struct SwDashStroke   //dash generation state; every member has a default initializer
+{
+    SwOutline* outline = nullptr;
+    float curLen = 0;
+    int32_t curIdx = 0;          //presumably the index into pattern - confirm
+    Point ptStart = {0, 0};
+    Point ptCur = {0, 0};
+    float* pattern = nullptr;
+    uint32_t cnt = 0;            //presumably the number of entries in pattern - confirm
+    bool curOpGap = false;
+    bool move = true;
+};
+
+struct SwShape   //per-shape raster data: outline, stroke, fill, RLE results and bounding box
+{
+    SwOutline*   outline = nullptr;
+    SwStroke*    stroke = nullptr;
+    SwFill*      fill = nullptr;
+    SwRle*   rle = nullptr;          //RLE for the shape itself
+    SwRle*   strokeRle = nullptr;    //RLE for the shape's stroke
+    SwBBox       bbox;           //Keep it boundary without stroke region. Using for optimal filling.
+
+    bool         fastTrack = false;   //Fast Track: axis-aligned rectangle without any clips?
+};
+
+struct SwImage   //image raster data: pixel buffer, dimensions and draw-mode flags
+{
+    SwOutline*   outline = nullptr;
+    SwRle*   rle = nullptr;
+    union {                  //three typed views of the same buffer pointer
+        pixel_t*  data;      //system based data pointer
+        uint32_t* buf32;     //for explicit 32bits channels
+        uint8_t*  buf8;      //for explicit 8bits grayscale
+    };
+    uint32_t     w, h, stride;   //no default initializers
+    int32_t      ox = 0;         //offset x
+    int32_t      oy = 0;         //offset y
+    float        scale;
+    uint8_t      channelSize;    //presumably the size of one pixel in bytes - confirm
+
+    bool         direct = false;  //draw image directly (with offset)
+    bool         scaled = false;  //draw scaled image
+};
+
+using SwMask = uint8_t(*)(uint8_t s, uint8_t d, uint8_t a);                 //src, dst, alpha
+using SwBlender = uint32_t(*)(uint32_t s, uint32_t d, uint8_t a);           //src, dst, alpha
+using SwJoin = uint32_t(*)(uint8_t r, uint8_t g, uint8_t b, uint8_t a);     //color channel join
+using SwAlpha = uint8_t(*)(uint8_t*);                                       //blending alpha
+
+struct SwCompositor;   //forward declaration: SwSurface only stores a pointer to it
+
+struct SwSurface : RenderSurface   //RenderSurface extended with the software engine's color-join, alpha and blend callbacks
+{
+    SwJoin  join;                         //color channel join callback
+    SwAlpha alphas[4];                    //Alpha:2, InvAlpha:3, Luma:4, InvLuma:5
+    SwBlender blender = nullptr;          //blender (optional)
+    SwCompositor* compositor = nullptr;   //compositor (optional)
+    BlendMethod blendMethod = BlendMethod::Normal;
+
+    SwAlpha alpha(CompositeMethod method)   //alpha callback for a matting method
+    {
+        auto idx = (int)(method) - 2;       //0: None, 1: ClipPath
+        return alphas[(idx < 0 || idx > 3) ? 0 : idx];   //CompositeMethod has only four Matting methods; anything else (None and ClipPath would give a negative index) falls back to entry 0
+    }
+
+    SwSurface()   //join and alphas are left unset here
+    {
+    }
+
+    SwSurface(const SwSurface* rhs) : RenderSurface(rhs)   //copies the base part and every member from rhs
+    {
+        join = rhs->join;
+        memcpy(alphas, rhs->alphas, sizeof(alphas));
+        blender = rhs->blender;
+        compositor = rhs->compositor;
+        blendMethod = rhs->blendMethod;
+    }
+};
+
+struct SwCompositor : RenderCompositor   //composition state for the software engine
+{
+    SwSurface* recoverSfc;                  //Recover surface when composition is started
+    SwCompositor* recoverCmp;               //Recover compositor when composition is done
+    SwImage image;
+    SwBBox bbox;
+    bool valid;
+};
+
+struct SwMpool   //outline pool served through the mpoolReq*/mpoolRet* functions declared in this header
+{
+    SwOutline* outline;
+    SwOutline* strokeOutline;
+    SwOutline* dashOutline;
+    unsigned allocSize;   //presumably the number of entries behind each pointer (mpoolInit takes a thread count) - confirm
+};
+
+static inline SwCoord TO_SWCOORD(float val)   //float -> SwCoord (1/64 units)
+{
+    return static_cast<SwCoord>(val * 64.0f);   //the fractional remainder is dropped by the conversion
+}
+
+static inline uint32_t JOIN(uint8_t c0, uint8_t c1, uint8_t c2, uint8_t c3)   //packs four bytes into one word: c0 in the top byte, c3 in the bottom byte
+{
+    return (uint32_t(c0) << 24) | (uint32_t(c1) << 16) | (uint32_t(c2) << 8) | uint32_t(c3);   //widen first: a promoted (signed) int shifted by 24 would reach the sign bit
+}
+
+static inline uint32_t ALPHA_BLEND(uint32_t c, uint32_t a)   //scales each 8-bit channel of c by (a + 1) / 256
+{
+    const uint32_t k = a + 1, odd = (c >> 8) & 0x00ff00ff, even = c & 0x00ff00ff;   //two channel pairs, each processed in one multiply
+    return ((odd * k) & 0xff00ff00) + (((even * k) >> 8) & 0x00ff00ff);
+}
+
+static inline uint32_t INTERPOLATE(uint32_t s, uint32_t d, uint8_t a)   //per channel: d + (((s - d) * a) >> 8), evaluated on two channel pairs at once
+{
+    return (((((((s >> 8) & 0xff00ff) - ((d >> 8) & 0xff00ff)) * a) + (d & 0xff00ff00)) & 0xff00ff00) + ((((((s & 0xff00ff) - (d & 0xff00ff)) * a) >> 8) + (d & 0xff00ff)) & 0xff00ff));   //first half: bytes 3 and 1, second half: bytes 2 and 0; unsigned wrap-around carries the negative differences
+}
+
+static inline uint8_t INTERPOLATE8(uint8_t s, uint8_t d, uint8_t a)   //8-bit blend: s weighted by a, d weighted by (255 - a)
+{
+    return (((s) * (a) + 0xff) >> 8) + (((d) * (0xff - (a)) + 0xff) >> 8);   //0xff - a, not ~a: after integer promotion ~a is the negative value -(a + 1), which dropped the d term
+}
+
+static inline SwCoord HALF_STROKE(float width)   //half of width, converted to SwCoord
+{
+    return TO_SWCOORD(0.5f * width);
+}
+
+static inline uint8_t A(uint32_t c)   //top byte (bits 24-31) of c
+{
+    return static_cast<uint8_t>(c >> 24);
+}
+
+static inline uint8_t IA(uint32_t c)   //bitwise inverse of the top byte of c, i.e. 255 - A(c)
+{
+    return static_cast<uint8_t>((~c) >> 24);
+}
+
+static inline uint8_t C1(uint32_t c)   //bits 16-23 of c
+{
+    return static_cast<uint8_t>(c >> 16);
+}
+
+static inline uint8_t C2(uint32_t c)   //bits 8-15 of c
+{
+    return static_cast<uint8_t>(c >> 8);
+}
+
+static inline uint8_t C3(uint32_t c)   //bits 0-7 of c
+{
+    return static_cast<uint8_t>(c);
+}
+
+static inline uint32_t opBlendInterp(uint32_t s, uint32_t d, uint8_t a)   //SwBlender-shaped wrapper around INTERPOLATE
+{
+    return INTERPOLATE(s, d, a);
+}
+
+static inline uint32_t opBlendNormal(uint32_t s, uint32_t d, uint8_t a)   //s scaled by a, then added to d scaled by the inverse alpha of that result
+{
+    const auto src = ALPHA_BLEND(s, a);
+    return ALPHA_BLEND(d, IA(src)) + src;   //d keeps the share that the scaled source leaves uncovered
+}
+
+static inline uint32_t opBlendPreNormal(uint32_t s, uint32_t d, TVG_UNUSED uint8_t a)   //like opBlendNormal but s is used as-is; 'a' is ignored
+{
+    return s + ALPHA_BLEND(d, IA(s));   //d is scaled by the inverse of s's top byte
+}
+
+static inline uint32_t opBlendSrcOver(uint32_t s, TVG_UNUSED uint32_t d, TVG_UNUSED uint8_t a)   //returns the source unchanged; d and a are ignored
+{
+    return s;
+}
+
+//TODO: BlendMethod could remove the alpha parameter.
+static inline uint32_t opBlendDifference(uint32_t s, uint32_t d, TVG_UNUSED uint8_t a)
+{
+    //per channel: |s - d|; the top byte of the result is forced to 255
+    auto absDiff = [](uint8_t x, uint8_t y) {
+        return (x > y) ? (x - y) : (y - x);
+    };
+    //the incoming alpha 'a' is not used
+    return JOIN(255, absDiff(C1(s), C1(d)), absDiff(C2(s), C2(d)), absDiff(C3(s), C3(d)));
+}
+
+static inline uint32_t opBlendExclusion(uint32_t s, uint32_t d, TVG_UNUSED uint8_t a)   //exclusion blend per channel; top byte forced to 255, 'a' unused
+{
+    // (s + d) - (2 * s * d)
+    auto c1 = C1(s) + C1(d) - 2 * MULTIPLY(C1(s), C1(d));
+    tvg::clamp(c1, 0, 255);   //NOTE(review): the return value is unused, so this relies on tvg::clamp() modifying c1 in place - confirm in tvgMath.h
+    auto c2 = C2(s) + C2(d) - 2 * MULTIPLY(C2(s), C2(d));
+    tvg::clamp(c2, 0, 255);
+    auto c3 = C3(s) + C3(d) - 2 * MULTIPLY(C3(s), C3(d));
+    tvg::clamp(c3, 0, 255);
+    return JOIN(255, c1, c2, c3);
+}
+
+static inline uint32_t opBlendAdd(uint32_t s, uint32_t d, TVG_UNUSED uint8_t a)
+{
+    //per channel: s + d saturated at 255; top byte forced to 255, 'a' unused
+    auto sum = [](int x, int y) { return std::min(x + y, 255); };
+    const auto c1 = sum(C1(s), C1(d));
+    const auto c2 = sum(C2(s), C2(d));
+    return JOIN(255, c1, c2, sum(C3(s), C3(d)));
+}
+
+static inline uint32_t opBlendScreen(uint32_t s, uint32_t d, TVG_UNUSED uint8_t a)   //screen blend per channel; top byte forced to 255, 'a' unused
+{
+    // s + d - s * d
+    auto c1 = C1(s) + C1(d) - MULTIPLY(C1(s), C1(d));   //no explicit clamp here; the value is narrowed to uint8_t by JOIN's parameters
+    auto c2 = C2(s) + C2(d) - MULTIPLY(C2(s), C2(d));
+    auto c3 = C3(s) + C3(d) - MULTIPLY(C3(s), C3(d));
+    return JOIN(255, c1, c2, c3);
+}
+
+static inline uint32_t opBlendMultiply(uint32_t s, uint32_t d, TVG_UNUSED uint8_t a)   //multiply blend per channel; top byte forced to 255, 'a' unused
+{
+    // s * d
+    auto c1 = MULTIPLY(C1(s), C1(d));   //MULTIPLY is defined outside this header section
+    auto c2 = MULTIPLY(C2(s), C2(d));
+    auto c3 = MULTIPLY(C3(s), C3(d));
+    return JOIN(255, c1, c2, c3);
+}
+
+static inline uint32_t opBlendOverlay(uint32_t s, uint32_t d, TVG_UNUSED uint8_t a)   //overlay blend per channel, branching on the destination channel; top byte forced to 255, 'a' unused
+{
+    // if (2 * d < da) => 2 * s * d,
+    // else => 1 - 2 * (1 - s) * (1 - d)
+    auto c1 = (C1(d) < 128) ? std::min(255, 2 * MULTIPLY(C1(s), C1(d))) : (255 - std::min(255, 2 * MULTIPLY(255 - C1(s), 255 - C1(d))));   //da is taken as 255, so the test is d < 128
+    auto c2 = (C2(d) < 128) ? std::min(255, 2 * MULTIPLY(C2(s), C2(d))) : (255 - std::min(255, 2 * MULTIPLY(255 - C2(s), 255 - C2(d))));
+    auto c3 = (C3(d) < 128) ? std::min(255, 2 * MULTIPLY(C3(s), C3(d))) : (255 - std::min(255, 2 * MULTIPLY(255 - C3(s), 255 - C3(d))));
+    return JOIN(255, c1, c2, c3);
+}
+
+static inline uint32_t opBlendDarken(uint32_t s, uint32_t d, TVG_UNUSED uint8_t a)
+{
+    //per channel: the smaller of s and d; top byte forced to 255, 'a' unused
+    auto lower = [](uint8_t x, uint8_t y) { return (y < x) ? y : x; };
+    const auto c1 = lower(C1(s), C1(d));
+    const auto c2 = lower(C2(s), C2(d));
+    return JOIN(255, c1, c2, lower(C3(s), C3(d)));
+}
+
+static inline uint32_t opBlendLighten(uint32_t s, uint32_t d, TVG_UNUSED uint8_t a)
+{
+    //per channel: the larger of s and d; top byte forced to 255, 'a' unused
+    auto higher = [](uint8_t x, uint8_t y) { return (x < y) ? y : x; };
+    const auto c1 = higher(C1(s), C1(d));
+    const auto c2 = higher(C2(s), C2(d));
+    return JOIN(255, c1, c2, higher(C3(s), C3(d)));
+}
+
+static inline uint32_t opBlendColorDodge(uint32_t s, uint32_t d, TVG_UNUSED uint8_t a)
+{
+    // d / (1 - s), per channel; top byte forced to 255, 'a' unused
+    auto dodge = [](uint8_t dst, uint8_t inv) -> int {   //inv: channel of the inverted source
+        return (dst == 0) ? 0 : ((inv == 0) ? 255 : std::min(dst * 255 / inv, 255));
+    };
+    const auto is = 0xffffffff - s;
+    return JOIN(255, dodge(C1(d), C1(is)), dodge(C2(d), C2(is)), dodge(C3(d), C3(is)));
+}
+
+static inline uint32_t opBlendColorBurn(uint32_t s, uint32_t d, TVG_UNUSED uint8_t a)
+{
+    // 1 - (1 - d) / s, per channel; top byte forced to 255, 'a' unused
+    auto burn = [](uint8_t src, uint8_t dst, uint8_t inv) -> int {   //inv: channel of the inverted destination
+        if (dst == 255) return 255;
+        return (src == 0) ? 0 : 255 - std::min(inv * 255 / src, 255);
+    };
+    const auto id = 0xffffffff - d;
+    return JOIN(255, burn(C1(s), C1(d), C1(id)), burn(C2(s), C2(d), C2(id)), burn(C3(s), C3(d), C3(id)));
+}
+
+static inline uint32_t opBlendHardLight(uint32_t s, uint32_t d, TVG_UNUSED uint8_t a)   //same per-channel formulas as opBlendOverlay, but the branch tests the source channel; top byte forced to 255, 'a' unused
+{
+    // if (s < sa), (2 * s * d)
+    // else (sa * da) - 2 * (da - s) * (sa - d)
+    auto c1 = (C1(s) < 128) ? std::min(255, 2 * MULTIPLY(C1(s), C1(d))) : (255 - std::min(255, 2 * MULTIPLY(255 - C1(s), 255 - C1(d))));   //sa and da are taken as 255, so the test is s < 128
+    auto c2 = (C2(s) < 128) ? std::min(255, 2 * MULTIPLY(C2(s), C2(d))) : (255 - std::min(255, 2 * MULTIPLY(255 - C2(s), 255 - C2(d))));
+    auto c3 = (C3(s) < 128) ? std::min(255, 2 * MULTIPLY(C3(s), C3(d))) : (255 - std::min(255, 2 * MULTIPLY(255 - C3(s), 255 - C3(d))));
+    return JOIN(255, c1, c2, c3);
+}
+
+static inline uint32_t opBlendSoftLight(uint32_t s, uint32_t d, TVG_UNUSED uint8_t a)   //soft-light blend per channel; top byte forced to 255, 'a' unused
+{
+    //(255 - 2 * s) * (d * d) + (2 * s * b)
+    auto c1 = MULTIPLY(255 - std::min(255, 2 * C1(s)), MULTIPLY(C1(d), C1(d))) + MULTIPLY(std::min(255, 2 * C1(s)), C1(d));   //2 * s is capped at 255 before it is used as a weight; the 'b' in the formula above is d in the code
+    auto c2 = MULTIPLY(255 - std::min(255, 2 * C2(s)), MULTIPLY(C2(d), C2(d))) + MULTIPLY(std::min(255, 2 * C2(s)), C2(d));
+    auto c3 = MULTIPLY(255 - std::min(255, 2 * C3(s)), MULTIPLY(C3(d), C3(d))) + MULTIPLY(std::min(255, 2 * C3(s)), C3(d));
+    return JOIN(255, c1, c2, c3);
+}
+
+
+int64_t mathMultiply(int64_t a, int64_t b);   //math helpers: only declared here; notes marked 'presumably' are inferred from names - confirm in the implementation
+int64_t mathDivide(int64_t a, int64_t b);
+int64_t mathMulDiv(int64_t a, int64_t b, int64_t c);   //presumably a * b / c - confirm
+void mathRotate(SwPoint& pt, SwFixed angle);   //pt is an in/out parameter (non-const reference)
+SwFixed mathTan(SwFixed angle);
+SwFixed mathAtan(const SwPoint& pt);
+SwFixed mathCos(SwFixed angle);
+SwFixed mathSin(SwFixed angle);
+void mathSplitCubic(SwPoint* base);   //base is writable; presumably split in place - confirm the required array size
+void mathSplitLine(SwPoint* base);
+SwFixed mathDiff(SwFixed angle1, SwFixed angle2);
+SwFixed mathLength(const SwPoint& pt);
+int mathCubicAngle(const SwPoint* base, SwFixed& angleIn, SwFixed& angleMid, SwFixed& angleOut);   //the three angles are output parameters
+SwFixed mathMean(SwFixed angle1, SwFixed angle2);
+SwPoint mathTransform(const Point* to, const Matrix& transform);
+bool mathUpdateOutlineBBox(const SwOutline* outline, const SwBBox& clipRegion, SwBBox& renderRegion, bool fastTrack);   //renderRegion is an output
+bool mathClipBBox(const SwBBox& clipper, SwBBox& clippee);   //clippee is modified in place (non-const reference)
+
+void shapeReset(SwShape* shape);   //SwShape lifecycle and generation functions; definitions live outside this header
+bool shapePrepare(SwShape* shape, const RenderShape* rshape, const Matrix& transform, const SwBBox& clipRegion, SwBBox& renderRegion, SwMpool* mpool, unsigned tid, bool hasComposite);   //renderRegion is an output
+bool shapePrepared(const SwShape* shape);
+bool shapeGenRle(SwShape* shape, const RenderShape* rshape, bool antiAlias);
+void shapeDelOutline(SwShape* shape, SwMpool* mpool, uint32_t tid);
+void shapeResetStroke(SwShape* shape, const RenderShape* rshape, const Matrix& transform);
+bool shapeGenStrokeRle(SwShape* shape, const RenderShape* rshape, const Matrix& transform, const SwBBox& clipRegion, SwBBox& renderRegion, SwMpool* mpool, unsigned tid);   //renderRegion is an output
+void shapeFree(SwShape* shape);
+void shapeDelStroke(SwShape* shape);
+bool shapeGenFillColors(SwShape* shape, const Fill* fill, const Matrix& transform, SwSurface* surface, uint8_t opacity, bool ctable);   //same parameter list as fillGenColorTable, with the shape in place of the SwFill
+bool shapeGenStrokeFillColors(SwShape* shape, const Fill* fill, const Matrix& transform, SwSurface* surface, uint8_t opacity, bool ctable);
+void shapeResetFill(SwShape* shape);
+void shapeResetStrokeFill(SwShape* shape);
+void shapeDelFill(SwShape* shape);
+void shapeDelStrokeFill(SwShape* shape);
+
+void strokeReset(SwStroke* stroke, const RenderShape* shape, const Matrix& transform);   //SwStroke functions; definitions live outside this header
+bool strokeParseOutline(SwStroke* stroke, const SwOutline& outline);
+SwOutline* strokeExportOutline(SwStroke* stroke, SwMpool* mpool, unsigned tid);   //presumably returns an outline taken from mpool - confirm who returns it to the pool
+void strokeFree(SwStroke* stroke);
+
+bool imagePrepare(SwImage* image, const Matrix& transform, const SwBBox& clipRegion, SwBBox& renderRegion, SwMpool* mpool, unsigned tid);   //SwImage functions; renderRegion is an output
+bool imageGenRle(SwImage* image, const SwBBox& renderRegion, bool antiAlias);
+void imageDelOutline(SwImage* image, SwMpool* mpool, uint32_t tid);
+void imageReset(SwImage* image);
+void imageFree(SwImage* image);
+
+bool fillGenColorTable(SwFill* fill, const Fill* fdata, const Matrix& transform, SwSurface* surface, uint8_t opacity, bool ctable);   //SwFill functions; presumably 'ctable' selects whether the color table is built - confirm
+const Fill::ColorStop* fillFetchSolid(const SwFill* fill, const Fill* fdata);
+void fillReset(SwFill* fill);
+void fillFree(SwFill* fill);
+
+//OPTIMIZE_ME: Skip the function pointer access
+void fillLinear(const SwFill* fill, uint8_t* dst, uint32_t y, uint32_t x, uint32_t len, SwMask maskOp, uint8_t opacity);                                   //composite masking ver.
+void fillLinear(const SwFill* fill, uint8_t* dst, uint32_t y, uint32_t x, uint32_t len, uint8_t* cmp, SwMask maskOp, uint8_t opacity);                     //direct masking ver.
+void fillLinear(const SwFill* fill, uint32_t* dst, uint32_t y, uint32_t x, uint32_t len, SwBlender op, uint8_t a);                                         //blending ver.
+void fillLinear(const SwFill* fill, uint32_t* dst, uint32_t y, uint32_t x, uint32_t len, SwBlender op, SwBlender op2, uint8_t a);                          //blending + BlendingMethod(op2) ver.
+void fillLinear(const SwFill* fill, uint32_t* dst, uint32_t y, uint32_t x, uint32_t len, uint8_t* cmp, SwAlpha alpha, uint8_t csize, uint8_t opacity);     //matting ver.
+
+void fillRadial(const SwFill* fill, uint8_t* dst, uint32_t y, uint32_t x, uint32_t len, SwMask op, uint8_t a);                                             //composite masking ver.
+void fillRadial(const SwFill* fill, uint8_t* dst, uint32_t y, uint32_t x, uint32_t len, uint8_t* cmp, SwMask op, uint8_t a) ;                              //direct masking ver.
+void fillRadial(const SwFill* fill, uint32_t* dst, uint32_t y, uint32_t x, uint32_t len, SwBlender op, uint8_t a);                                         //blending ver.
+void fillRadial(const SwFill* fill, uint32_t* dst, uint32_t y, uint32_t x, uint32_t len, SwBlender op, SwBlender op2, uint8_t a);                          //blending + BlendingMethod(op2) ver.
+void fillRadial(const SwFill* fill, uint32_t* dst, uint32_t y, uint32_t x, uint32_t len, uint8_t* cmp, SwAlpha alpha, uint8_t csize, uint8_t opacity);     //matting ver. (the five fillRadial overloads mirror the five fillLinear ones)
+
+SwRle* rleRender(SwRle* rle, const SwOutline* outline, const SwBBox& renderRegion, bool antiAlias);   //SwRle functions; presumably returns the (possibly reallocated) rle - confirm
+SwRle* rleRender(const SwBBox* bbox);   //overload taking only a bounding box
+void rleFree(SwRle* rle);
+void rleReset(SwRle* rle);
+void rleMerge(SwRle* rle, SwRle* clip1, SwRle* clip2);
+bool rleClip(SwRle* rle, const SwRle* clip);    //clip against another RLE
+bool rleClip(SwRle* rle, const SwBBox* clip);   //clip against a bounding box
+
+SwMpool* mpoolInit(uint32_t threads);   //SwMpool functions; each Req* below has a matching Ret*
+bool mpoolTerm(SwMpool* mpool);
+bool mpoolClear(SwMpool* mpool);
+SwOutline* mpoolReqOutline(SwMpool* mpool, unsigned idx);   //presumably idx is the calling thread's index - confirm
+void mpoolRetOutline(SwMpool* mpool, unsigned idx);
+SwOutline* mpoolReqStrokeOutline(SwMpool* mpool, unsigned idx);
+void mpoolRetStrokeOutline(SwMpool* mpool, unsigned idx);
+SwOutline* mpoolReqDashOutline(SwMpool* mpool, unsigned idx);
+void mpoolRetDashOutline(SwMpool* mpool, unsigned idx);
+
+bool rasterCompositor(SwSurface* surface);   //raster functions; definitions live outside this header
+bool rasterGradientShape(SwSurface* surface, SwShape* shape, const Fill* fdata, uint8_t opacity);
+bool rasterShape(SwSurface* surface, SwShape* shape, uint8_t r, uint8_t g, uint8_t b, uint8_t a);
+bool rasterImage(SwSurface* surface, SwImage* image, const Matrix& transform, const SwBBox& bbox, uint8_t opacity);
+bool rasterStroke(SwSurface* surface, SwShape* shape, uint8_t r, uint8_t g, uint8_t b, uint8_t a);
+bool rasterGradientStroke(SwSurface* surface, SwShape* shape, const Fill* fdata, uint8_t opacity);
+bool rasterClear(SwSurface* surface, uint32_t x, uint32_t y, uint32_t w, uint32_t h, pixel_t val = 0);   //val defaults to 0
+void rasterPixel32(uint32_t *dst, uint32_t val, uint32_t offset, int32_t len);   //overload taking a single value
+void rasterTranslucentPixel32(uint32_t* dst, uint32_t* src, uint32_t len, uint8_t opacity);
+void rasterPixel32(uint32_t* dst, uint32_t* src, uint32_t len, uint8_t opacity);   //overload taking a source buffer
+void rasterGrayscale8(uint8_t *dst, uint8_t val, uint32_t offset, int32_t len);
+void rasterXYFlip(uint32_t* src, uint32_t* dst, int32_t stride, int32_t w, int32_t h, const SwBBox& bbox, bool flipped);
+void rasterUnpremultiply(RenderSurface* surface);   //whole-surface overload
+void rasterPremultiply(RenderSurface* surface);
+bool rasterConvertCS(RenderSurface* surface, ColorSpace to);
+uint32_t rasterUnpremultiply(uint32_t data);   //single-value overload
+
+bool effectGaussianBlur(SwCompositor* cmp, SwSurface* surface, const RenderEffectGaussianBlur* params);   //effect functions; definitions live outside this header
+bool effectGaussianBlurRegion(RenderEffectGaussianBlur* effect);
+void effectGaussianBlurUpdate(RenderEffectGaussianBlur* effect, const Matrix& transform);
+bool effectDropShadow(SwCompositor* cmp, SwSurface* surfaces[2], const RenderEffectDropShadow* params, bool direct);   //takes two surfaces
+bool effectDropShadowRegion(RenderEffectDropShadow* effect);
+void effectDropShadowUpdate(RenderEffectDropShadow* effect, const Matrix& transform);
+void effectFillUpdate(RenderEffectFill* effect);
+bool effectFill(SwCompositor* cmp, const RenderEffectFill* params, bool direct);
+void effectTintUpdate(RenderEffectTint* effect);
+bool effectTint(SwCompositor* cmp, const RenderEffectTint* params, bool direct);
+void effectTritoneUpdate(RenderEffectTritone* effect);
+bool effectTritone(SwCompositor* cmp, const RenderEffectTritone* params, bool direct);
+
+#endif /* _TVG_SW_COMMON_H_ */
--- a/thirdparty/thorvg/src/renderer/sw_engine/tvgSwFill.cpp
+++ b/thirdparty/thorvg/src/renderer/sw_engine/tvgSwFill.cpp
@@ -0,0 +1,873 @@
+/*
+ * Copyright (c) 2020 - 2024 the ThorVG project. All rights reserved.
+
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "tvgMath.h"
+#include "tvgSwCommon.h"
+#include "tvgFill.h"
+
+/************************************************************************/
+/* Internal Class Implementation                                        */
+/************************************************************************/
+
+#define RADIAL_A_THRESHOLD 0.0005f   //presumably the limit below which radial 'a' is treated as degenerate - confirm at its use site
+#define GRADIENT_STOP_SIZE 1024      //number of entries in SwFill::ctable
+#define FIXPT_BITS 8                 //fractional bits of the fixed-point helper values
+#define FIXPT_SIZE (1<<FIXPT_BITS)   //256
+
+/*
+ * quadratic equation with the following coefficients (rx and ry defined in the _calculateCoefficients()):
+ * A = a  // fill->radial.a
+ * B = 2 * (dr * fr + rx * dx + ry * dy)
+ * C = fr^2 - rx^2 - ry^2
+ * Derivatives are computed with respect to dx.
+ * This procedure aims to optimize and eliminate the need to calculate all values from the beginning
+ * for consecutive x values with a constant y. The Taylor series expansions are computed as long as
+ * its terms are non-zero.
+ */
+static void _calculateCoefficients(const SwFill* fill, uint32_t x, uint32_t y, float& b, float& deltaB, float& det, float& deltaDet, float& deltaDeltaDet)   //the five float references are outputs
+{
+    auto radial = &fill->radial;
+
+    auto rx = (x + 0.5f) * radial->a11 + (y + 0.5f) * radial->a12 + radial->a13 - radial->fx;   //pixel center mapped through the a11..a23 matrix, relative to the focal point
+    auto ry = (x + 0.5f) * radial->a21 + (y + 0.5f) * radial->a22 + radial->a23 - radial->fy;
+
+    b = (radial->dr * radial->fr + rx * radial->dx + ry * radial->dy) * radial->invA;
+    deltaB = (radial->a11 * radial->dx + radial->a21 * radial->dy) * radial->invA;   //change of b for a +1 step in x
+
+    auto rr = rx * rx + ry * ry;
+    auto deltaRr = 2.0f * (rx * radial->a11 + ry * radial->a21) * radial->invA;   //first-order term of rr * invA for a +1 step in x
+    auto deltaDeltaRr = 2.0f * (radial->a11 * radial->a11 + radial->a21 * radial->a21) * radial->invA;   //second-order term of rr * invA
+
+    det = b * b + (rr - radial->fr * radial->fr) * radial->invA;
+    deltaDet = 2.0f * b * deltaB + deltaB * deltaB + deltaRr + deltaDeltaRr * 0.5f;   //det(x + 1) - det(x)
+    deltaDeltaDet = 2.0f * deltaB * deltaB + deltaDeltaRr;   //constant second difference of det
+}
+
+
+static uint32_t _estimateAAMargin(const Fill* fdata)   //800 divided by the gradient extent (radius, or distance between the two end points); 0 when the extent is zero
+{
+    constexpr float marginScalingFactor = 800.0f;
+    if (fdata->type() == Type::RadialGradient) {
+        auto radius = P(static_cast<const RadialGradient*>(fdata))->r;
+        return tvg::zero(radius) ? 0 : static_cast<uint32_t>(marginScalingFactor / radius);
+    }
+    auto grad = P(static_cast<const LinearGradient*>(fdata));   //any type other than RadialGradient is handled as linear
+    Point p1 {grad->x1, grad->y1};
+    Point p2 {grad->x2, grad->y2};
+    auto len = length(&p1, &p2);
+    return tvg::zero(len) ? 0 : static_cast<uint32_t>(marginScalingFactor / len);
+}
+
+
+static void _adjustAAMargin(uint32_t& iMargin, uint32_t index)
+{
+    //caps iMargin at 10% of index and at 40, whichever is smaller
+    constexpr float threshold = 0.1f;
+    constexpr uint32_t iMarginMax = 40;
+
+    const auto limit = std::min(static_cast<uint32_t>(index * threshold), iMarginMax);
+    if (iMargin > limit) iMargin = limit;
+}
+
+
+static inline uint32_t _alphaUnblend(uint32_t c)
+{
+    //scales the three low bytes of c by 255 / (top byte); fractions are truncated
+    const auto a = (c >> 24);
+    if (a == 0 || a == 255) return c;   //returned unchanged for these two alpha values
+    const auto invA = 255.0f / static_cast<float>(a);
+    auto scale = [invA](uint32_t ch) { return static_cast<uint8_t>(static_cast<float>(ch & 0xFF) * invA); };
+    const auto c0 = scale(c >> 16), c1 = scale(c >> 8), c2 = scale(c);
+
+    return (a << 24) | (c0 << 16) | (c1 << 8) | c2;
+}
+
+
+static void _applyAA(const SwFill* fill, uint32_t begin, uint32_t end)   //rewrites the last 'end' and the first 'begin' entries of fill->ctable as one ramp across the wrap-around
+{
+    if (begin == 0 || end == 0) return;   //nothing is done unless both margins are non-zero
+
+    auto i = GRADIENT_STOP_SIZE - end;   //first entry to rewrite
+    auto rgbaEnd = _alphaUnblend(fill->ctable[i]);
+    auto rgbaBegin = _alphaUnblend(fill->ctable[begin]);
+
+    auto dt = 1.0f / (begin + end + 1.0f);
+    float t = dt;
+    while (i != begin) {   //runs end + begin times: up to the last entry, then from entry 0 up to begin - 1
+        auto dist = 255 - static_cast<int32_t>(255 * t);   //weight of rgbaEnd, shrinking as t grows
+        auto color = INTERPOLATE(rgbaEnd, rgbaBegin, dist);
+        fill->ctable[i++] = ALPHA_BLEND((color | 0xff000000), (color >> 24));   //scale the channels by the interpolated alpha again
+
+        if (i == GRADIENT_STOP_SIZE) i = 0;   //wrap to the start of the table
+        t += dt;
+    }
+}
+
+
+static bool _updateColorTable(SwFill* fill, const Fill* fdata, const SwSurface* surface, uint8_t opacity)   //(re)builds fill->ctable from fdata's color stops; false on allocation failure or when there are no stops
+{
+    if (fill->solid) return true;   //nothing to build for a solid fill
+
+    if (!fill->ctable) {
+        fill->ctable = static_cast<uint32_t*>(malloc(GRADIENT_STOP_SIZE * sizeof(uint32_t)));
+        if (!fill->ctable) return false;
+    }
+
+    const Fill::ColorStop* colors;
+    auto cnt = fdata->colorStops(&colors);
+    if (cnt == 0 || !colors) return false;
+
+    auto pColors = colors;
+
+    auto a = MULTIPLY(pColors->a, opacity);
+    if (a < 255) fill->translucent = true;
+
+    auto r = pColors->r;
+    auto g = pColors->g;
+    auto b = pColors->b;
+    auto rgba = surface->join(r, g, b, a);
+
+    auto inc = 1.0f / static_cast<float>(GRADIENT_STOP_SIZE);
+    auto pos = 1.5f * inc;   //entry 0 is written directly below, so sampling starts at the center of entry 1
+    uint32_t i = 0;
+
+    //If repeat is true, anti-aliasing must be applied between the last and the first colors.
+    auto repeat = fill->spread == FillSpread::Repeat;
+    uint32_t iAABegin = repeat ? _estimateAAMargin(fdata) : 0;
+    uint32_t iAAEnd = 0;
+
+    fill->ctable[i++] = ALPHA_BLEND(rgba | 0xff000000, a);
+
+    while (pos <= pColors->offset && i < GRADIENT_STOP_SIZE) {   //repeat the first color up to the first stop; the index bound keeps an offset above 1 from writing past the table
+        fill->ctable[i] = fill->ctable[i - 1];
+        ++i;
+        pos += inc;
+    }
+
+    for (uint32_t j = 0; j < cnt - 1; ++j) {
+        if (repeat && j == cnt - 2 && iAAEnd == 0) {   //last segment: fix the end margin from the entries that are still free
+            iAAEnd = iAABegin;
+            _adjustAAMargin(iAAEnd, GRADIENT_STOP_SIZE - i);
+        }
+
+        auto curr = colors + j;
+        auto next = curr + 1;
+        auto delta = 1.0f / (next->offset - curr->offset);
+        auto a2 = MULTIPLY(next->a, opacity);
+        if (!fill->translucent && a2 < 255) fill->translucent = true;
+
+        auto rgba2 = surface->join(next->r, next->g, next->b, a2);
+
+        while (pos < next->offset && i < GRADIENT_STOP_SIZE) {
+            auto t = (pos - curr->offset) * delta;
+            auto dist = static_cast<int32_t>(255 * t);
+            auto dist2 = 255 - dist;   //weight of the current stop's color
+
+            auto color = INTERPOLATE(rgba, rgba2, dist2);
+            fill->ctable[i] = ALPHA_BLEND((color | 0xff000000), (color >> 24));
+
+            ++i;
+            pos += inc;
+        }
+        rgba = rgba2;
+        a = a2;
+
+        if (repeat && j == 0) _adjustAAMargin(iAABegin, i - 1);   //first segment: limit the begin margin by the entries written so far
+    }
+    rgba = ALPHA_BLEND((rgba | 0xff000000), a);
+
+    for (; i < GRADIENT_STOP_SIZE; ++i)   //pad the remainder with the last stop's color
+        fill->ctable[i] = rgba;
+
+    //For repeat fill spread apply anti-aliasing between the last and first colors,
+    //otherwise make sure the last color stop is represented at the end of the table.
+    if (repeat) _applyAA(fill, iAABegin, iAAEnd);
+    else fill->ctable[GRADIENT_STOP_SIZE - 1] = rgba;
+
+    return true;
+}
+
+
+bool _prepareLinear(SwFill* fill, const LinearGradient* linear, const Matrix& transform)
+{
+    //Fills fill->linear (dx, dy, offset) from the gradient line and the inverse of the combined
+    //transform. Returns false when the line can't be read or the combined matrix is not invertible;
+    //returns true early, without touching offset, for a (near) zero-length line.
+    float x1, x2, y1, y2;
+    if (linear->linear(&x1, &y1, &x2, &y2) != Result::Success) return false;
+
+    fill->linear.dx = x2 - x1;
+    fill->linear.dy = y2 - y1;
+    auto len = fill->linear.dx * fill->linear.dx + fill->linear.dy * fill->linear.dy;   //squared length
+
+    //degenerate line: flagged solid only when both deltas are zero
+    if (len < FLOAT_EPSILON) {
+        if (tvg::zero(fill->linear.dx) && tvg::zero(fill->linear.dy)) {
+            fill->solid = true;
+        }
+        return true;
+    }
+
+    //after dividing by the squared length, dx * x + dy * y + offset is 0 at (x1, y1) and 1 at (x2, y2)
+    fill->linear.dx /= len;
+    fill->linear.dy /= len;
+    fill->linear.offset = -fill->linear.dx * x1 - fill->linear.dy * y1;
+
+    //'transform' always applies; the gradient's own matrix is combined in unless it is the identity
+    auto gradTransform = linear->transform();
+    if (identity((const Matrix*)(&gradTransform))) gradTransform = transform;
+    else gradTransform = transform * gradTransform;
+
+    //the inverse of the combined matrix is folded into offset, dx and dy
+    Matrix invTransform;
+    if (!inverse(&gradTransform, &invTransform)) return false;
+
+    fill->linear.offset += fill->linear.dx * invTransform.e13 + fill->linear.dy * invTransform.e23;
+
+    //keep the old dx: it is overwritten below but still needed for dy
+    auto dx = fill->linear.dx;
+    fill->linear.dx = dx * invTransform.e11 + fill->linear.dy * invTransform.e21;
+    fill->linear.dy = dx * invTransform.e12 + fill->linear.dy * invTransform.e22;
+
+    return true;
+}
+
+
+//Sets up fill->radial: the focal point, the deltas between the focal and the end circle,
+//the coefficient 'a' (with its inverse when positive) and the inverse transform terms a11..a23.
+//Returns false if the combined transform can't be inverted.
+bool _prepareRadial(SwFill* fill, const RadialGradient* radial, const Matrix& transform)
+{
+    auto impl = P(radial);
+    auto cx = impl->cx;
+    auto cy = impl->cy;
+    auto r = impl->r;
+    auto fx = impl->fx;
+    auto fy = impl->fy;
+    auto fr = impl->fr;
+
+    //a zero radius is flagged as a solid fill
+    if (tvg::zero(r)) {
+        fill->solid = true;
+        return true;
+    }
+
+    auto& rad = fill->radial;
+    rad.dr = r - fr;
+    rad.dx = cx - fx;
+    rad.dy = cy - fy;
+    rad.fr = fr;
+    rad.fx = fx;
+    rad.fy = fy;
+    rad.a = rad.dr * rad.dr - rad.dx * rad.dx - rad.dy * rad.dy;
+
+    //This condition fulfills the SVG 1.1 std:
+    //the focal point, if outside the end circle, is moved to be on the end circle
+    //See: the SVG 2 std requirements: https://www.w3.org/TR/SVG2/pservers.html#RadialGradientNotes
+    if (rad.a < 0) {
+        auto dist = sqrtf(rad.dx * rad.dx + rad.dy * rad.dy);
+        rad.fx = cx + r * (fx - cx) / dist;
+        rad.fy = cy + r * (fy - cy) / dist;
+        rad.dx = cx - rad.fx;
+        rad.dy = cy - rad.fy;
+        // Prevent loss of precision on Apple Silicon when dr=dy and dx=0 due to FMA
+        // https://github.com/thorvg/thorvg/issues/2014
+        auto dr2 = rad.dr * rad.dr;
+        auto dx2 = rad.dx * rad.dx;
+        auto dy2 = rad.dy * rad.dy;
+
+        rad.a = dr2 - dx2 - dy2;
+    }
+
+    if (rad.a > 0) rad.invA = 1.0f / rad.a;
+
+    //the paint transform always applies; the gradient's own transform only if it isn't identity
+    auto gradTransform = radial->transform();
+    if (identity(static_cast<const Matrix*>(&gradTransform))) gradTransform = transform;
+    else gradTransform = transform * gradTransform;
+
+    Matrix invTransform;
+    if (!inverse(&gradTransform, &invTransform)) return false;
+
+    rad.a11 = invTransform.e11;
+    rad.a12 = invTransform.e12;
+    rad.a13 = invTransform.e13;
+    rad.a21 = invTransform.e21;
+    rad.a22 = invTransform.e22;
+    rad.a23 = invTransform.e23;
+
+    return true;
+}
+
+
+//Maps a color table position onto a valid index according to the fill's spread mode:
+//Pad saturates at both ends, Repeat wraps around, Reflect mirrors every other period.
+static inline uint32_t _clamp(const SwFill* fill, int32_t pos)
+{
+    if (fill->spread == FillSpread::Pad) {
+        if (pos >= GRADIENT_STOP_SIZE) return GRADIENT_STOP_SIZE - 1;
+        if (pos < 0) return 0;
+        return pos;
+    }
+
+    if (fill->spread == FillSpread::Repeat) {
+        int32_t wrapped = pos % GRADIENT_STOP_SIZE;
+        //the remainder keeps the sign of 'pos': shift negatives into range
+        if (wrapped < 0) wrapped = GRADIENT_STOP_SIZE + wrapped;
+        return wrapped;
+    }
+
+    if (fill->spread == FillSpread::Reflect) {
+        auto period = GRADIENT_STOP_SIZE * 2;
+        int32_t folded = pos % period;
+        if (folded < 0) folded = period + folded;
+        //second half of the period runs backwards through the table
+        if (folded >= GRADIENT_STOP_SIZE) folded = (period - folded - 1);
+        return folded;
+    }
+
+    return pos;
+}
+
+
+//Looks up the color for a fixed-point table position (FIXPT_BITS fractional bits).
+static inline uint32_t _fixedPixel(const SwFill* fill, int32_t pos)
+{
+    //add half a unit before dropping the fractional bits
+    int32_t rounded = (pos + (FIXPT_SIZE / 2)) >> FIXPT_BITS;
+    auto idx = _clamp(fill, rounded);
+    return fill->ctable[idx];
+}
+
+
+//Looks up the color for a floating point gradient position, where 1.0 corresponds to
+//the last entry of the color table.
+static inline uint32_t _pixel(const SwFill* fill, float pos)
+{
+    auto idx = _clamp(fill, static_cast<int32_t>(pos * (GRADIENT_STOP_SIZE - 1) + 0.5f));
+    return fill->ctable[idx];
+}
+
+
+/************************************************************************/
+/* External Class Implementation                                        */
+/************************************************************************/
+
+
+//Fills 'len' pixels of a span starting at (x, y) with the radial gradient using opBlendNormal.
+//The blend alpha of each pixel is alpha(cmp), scaled by 'opacity' unless it is 255;
+//'cmp' advances by 'csize' per pixel.
+void fillRadial(const SwFill* fill, uint32_t* dst, uint32_t y, uint32_t x, uint32_t len, uint8_t* cmp, SwAlpha alpha, uint8_t csize, uint8_t opacity)
+{
+    const auto end = dst + len;
+    const auto opaque = (opacity == 255);
+
+    //edge case
+    if (fill->radial.a < RADIAL_A_THRESHOLD) {
+        const auto& rad = fill->radial;
+        auto rx = (x + 0.5f) * rad.a11 + (y + 0.5f) * rad.a12 + rad.a13 - rad.fx;
+        auto ry = (x + 0.5f) * rad.a21 + (y + 0.5f) * rad.a22 + rad.a23 - rad.fy;
+
+        if (opaque) {
+            for (; dst != end; ++dst, cmp += csize) {
+                auto pos = 0.5f * (rx * rx + ry * ry - rad.fr * rad.fr) / (rad.dr * rad.fr + rx * rad.dx + ry * rad.dy);
+                *dst = opBlendNormal(_pixel(fill, pos), *dst, alpha(cmp));
+                rx += rad.a11;
+                ry += rad.a21;
+            }
+        } else {
+            for (; dst != end; ++dst, cmp += csize) {
+                auto pos = 0.5f * (rx * rx + ry * ry - rad.fr * rad.fr) / (rad.dr * rad.fr + rx * rad.dx + ry * rad.dy);
+                *dst = opBlendNormal(_pixel(fill, pos), *dst, MULTIPLY(opacity, alpha(cmp)));
+                rx += rad.a11;
+                ry += rad.a21;
+            }
+        }
+        return;
+    }
+
+    //regular case: det and b are stepped by their precomputed deltas for every pixel
+    float b, deltaB, det, deltaDet, deltaDeltaDet;
+    _calculateCoefficients(fill, x, y, b, deltaB, det, deltaDet, deltaDeltaDet);
+
+    if (opaque) {
+        for (; dst != end; ++dst, cmp += csize) {
+            *dst = opBlendNormal(_pixel(fill, sqrtf(det) - b), *dst, alpha(cmp));
+            b += deltaB;
+            det += deltaDet;
+            deltaDet += deltaDeltaDet;
+        }
+    } else {
+        for (; dst != end; ++dst, cmp += csize) {
+            *dst = opBlendNormal(_pixel(fill, sqrtf(det) - b), *dst, MULTIPLY(opacity, alpha(cmp)));
+            b += deltaB;
+            det += deltaDet;
+            deltaDet += deltaDeltaDet;
+        }
+    }
+}
+
+
+//Fills 'len' pixels of a span starting at (x, y) with the radial gradient, combining each
+//gradient color with the destination through 'op' and the alpha 'a'.
+void fillRadial(const SwFill* fill, uint32_t* dst, uint32_t y, uint32_t x, uint32_t len, SwBlender op, uint8_t a)
+{
+    const auto end = dst + len;
+
+    //edge case
+    if (fill->radial.a < RADIAL_A_THRESHOLD) {
+        const auto& rad = fill->radial;
+        auto rx = (x + 0.5f) * rad.a11 + (y + 0.5f) * rad.a12 + rad.a13 - rad.fx;
+        auto ry = (x + 0.5f) * rad.a21 + (y + 0.5f) * rad.a22 + rad.a23 - rad.fy;
+        for (; dst != end; ++dst) {
+            auto pos = 0.5f * (rx * rx + ry * ry - rad.fr * rad.fr) / (rad.dr * rad.fr + rx * rad.dx + ry * rad.dy);
+            *dst = op(_pixel(fill, pos), *dst, a);
+            rx += rad.a11;
+            ry += rad.a21;
+        }
+        return;
+    }
+
+    //regular case: det and b are stepped by their precomputed deltas for every pixel
+    float b, deltaB, det, deltaDet, deltaDeltaDet;
+    _calculateCoefficients(fill, x, y, b, deltaB, det, deltaDet, deltaDeltaDet);
+
+    for (; dst != end; ++dst) {
+        *dst = op(_pixel(fill, sqrtf(det) - b), *dst, a);
+        b += deltaB;
+        det += deltaDet;
+        deltaDet += deltaDeltaDet;
+    }
+}
+
+
+//Fills 'len' 8-bit mask pixels of a span starting at (x, y): the alpha of each radial
+//gradient color, scaled by 'a', is merged into the destination through 'maskOp'.
+void fillRadial(const SwFill* fill, uint8_t* dst, uint32_t y, uint32_t x, uint32_t len, SwMask maskOp, uint8_t a)
+{
+    const auto end = dst + len;
+
+    //edge case
+    if (fill->radial.a < RADIAL_A_THRESHOLD) {
+        const auto& rad = fill->radial;
+        auto rx = (x + 0.5f) * rad.a11 + (y + 0.5f) * rad.a12 + rad.a13 - rad.fx;
+        auto ry = (x + 0.5f) * rad.a21 + (y + 0.5f) * rad.a22 + rad.a23 - rad.fy;
+        for (; dst != end; ++dst) {
+            auto pos = 0.5f * (rx * rx + ry * ry - rad.fr * rad.fr) / (rad.dr * rad.fr + rx * rad.dx + ry * rad.dy);
+            auto src = MULTIPLY(a, A(_pixel(fill, pos)));
+            *dst = maskOp(src, *dst, ~src);
+            rx += rad.a11;
+            ry += rad.a21;
+        }
+        return;
+    }
+
+    //regular case: det and b are stepped by their precomputed deltas for every pixel
+    float b, deltaB, det, deltaDet, deltaDeltaDet;
+    _calculateCoefficients(fill, x, y, b, deltaB, det, deltaDet, deltaDeltaDet);
+
+    for (; dst != end; ++dst) {
+        auto src = MULTIPLY(a, A(_pixel(fill, sqrtf(det) - b)));
+        *dst = maskOp(src, *dst, ~src);
+        b += deltaB;
+        det += deltaDet;
+        deltaDet += deltaDeltaDet;
+    }
+}
+
+
+//Fills 'len' 8-bit mask pixels of a span starting at (x, y): the alpha of each radial
+//gradient color, scaled by 'a', is combined with the matching 'cmp' pixel through 'maskOp',
+//and the outcome is composited over the existing destination value.
+void fillRadial(const SwFill* fill, uint8_t* dst, uint32_t y, uint32_t x, uint32_t len, uint8_t* cmp, SwMask maskOp, uint8_t a)
+{
+    //edge case
+    if (fill->radial.a < RADIAL_A_THRESHOLD) {
+        auto radial = &fill->radial;
+        auto rx = (x + 0.5f) * radial->a11 + (y + 0.5f) * radial->a12 + radial->a13 - radial->fx;
+        auto ry = (x + 0.5f) * radial->a21 + (y + 0.5f) * radial->a22 + radial->a23 - radial->fy;
+        for (uint32_t i = 0 ; i < len ; ++i, ++dst, ++cmp) {
+            auto x0 = 0.5f * (rx * rx + ry * ry - radial->fr * radial->fr) / (radial->dr * radial->fr + rx * radial->dx + ry * radial->dy);
+            //extract the alpha channel exactly once, as every other fill routine does
+            auto src = MULTIPLY(A(_pixel(fill, x0)), a);
+            auto tmp = maskOp(src, *cmp, 0);
+            *dst = tmp + MULTIPLY(*dst, ~tmp);
+            rx += radial->a11;
+            ry += radial->a21;
+        }
+    } else {
+        float b, deltaB, det, deltaDet, deltaDeltaDet;
+        _calculateCoefficients(fill, x, y, b, deltaB, det, deltaDet, deltaDeltaDet);
+
+        for (uint32_t i = 0 ; i < len ; ++i, ++dst, ++cmp) {
+            //the gradient position is sqrtf(det) - b, same as in the other fillRadial() variants
+            auto src = MULTIPLY(A(_pixel(fill, sqrtf(det) - b)), a);
+            auto tmp = maskOp(src, *cmp, 0);
+            *dst = tmp + MULTIPLY(*dst, ~tmp);
+            det += deltaDet;
+            deltaDet += deltaDeltaDet;
+            b += deltaB;
+        }
+    }
+}
+
+
+//Fills 'len' pixels of a span starting at (x, y) with the radial gradient in two stages:
+//'op' combines the gradient color with the destination, 'op2' combines that result with the
+//destination again. When 'a' is not 255 the outcome is interpolated with the destination by 'a'.
+void fillRadial(const SwFill* fill, uint32_t* dst, uint32_t y, uint32_t x, uint32_t len, SwBlender op, SwBlender op2, uint8_t a)
+{
+    const auto end = dst + len;
+    const auto opaque = (a == 255);
+
+    //edge case
+    if (fill->radial.a < RADIAL_A_THRESHOLD) {
+        const auto& rad = fill->radial;
+        auto rx = (x + 0.5f) * rad.a11 + (y + 0.5f) * rad.a12 + rad.a13 - rad.fx;
+        auto ry = (x + 0.5f) * rad.a21 + (y + 0.5f) * rad.a22 + rad.a23 - rad.fy;
+
+        if (opaque) {
+            for (; dst != end; ++dst) {
+                auto pos = 0.5f * (rx * rx + ry * ry - rad.fr * rad.fr) / (rad.dr * rad.fr + rx * rad.dx + ry * rad.dy);
+                auto first = op(_pixel(fill, pos), *dst, 255);
+                *dst = op2(first, *dst, 255);
+                rx += rad.a11;
+                ry += rad.a21;
+            }
+        } else {
+            for (; dst != end; ++dst) {
+                auto pos = 0.5f * (rx * rx + ry * ry - rad.fr * rad.fr) / (rad.dr * rad.fr + rx * rad.dx + ry * rad.dy);
+                auto first = op(_pixel(fill, pos), *dst, 255);
+                auto second = op2(first, *dst, 255);
+                *dst = INTERPOLATE(second, *dst, a);
+                rx += rad.a11;
+                ry += rad.a21;
+            }
+        }
+        return;
+    }
+
+    //regular case: det and b are stepped by their precomputed deltas for every pixel
+    float b, deltaB, det, deltaDet, deltaDeltaDet;
+    _calculateCoefficients(fill, x, y, b, deltaB, det, deltaDet, deltaDeltaDet);
+
+    if (opaque) {
+        for (; dst != end; ++dst) {
+            auto first = op(_pixel(fill, sqrtf(det) - b), *dst, 255);
+            *dst = op2(first, *dst, 255);
+            b += deltaB;
+            det += deltaDet;
+            deltaDet += deltaDeltaDet;
+        }
+    } else {
+        for (; dst != end; ++dst) {
+            auto first = op(_pixel(fill, sqrtf(det) - b), *dst, 255);
+            auto second = op2(first, *dst, 255);
+            *dst = INTERPOLATE(second, *dst, a);
+            b += deltaB;
+            det += deltaDet;
+            deltaDet += deltaDeltaDet;
+        }
+    }
+}
+
+
+//Fills 'len' pixels of a span starting at (x, y) with the linear gradient using opBlendNormal.
+//The blend alpha of each pixel is alpha(cmp), scaled by 'opacity' unless it is 255;
+//'cmp' advances by 'csize' per pixel.
+void fillLinear(const SwFill* fill, uint32_t* dst, uint32_t y, uint32_t x, uint32_t len, uint8_t* cmp, SwAlpha alpha, uint8_t csize, uint8_t opacity)
+{
+    //Rotation
+    float px = x + 0.5f;
+    float py = y + 0.5f;
+    float t = (fill->linear.dx * px + fill->linear.dy * py + fill->linear.offset) * (GRADIENT_STOP_SIZE - 1);
+    float inc = (fill->linear.dx) * (GRADIENT_STOP_SIZE - 1);
+
+    const auto end = dst + len;
+    const auto opaque = (opacity == 255);
+
+    //no change along the span: a single table lookup serves every pixel
+    if (tvg::zero(inc)) {
+        auto color = _fixedPixel(fill, static_cast<int32_t>(t * FIXPT_SIZE));
+        for (; dst != end; ++dst, cmp += csize) {
+            *dst = opBlendNormal(color, *dst, opaque ? alpha(cmp) : MULTIPLY(alpha(cmp), opacity));
+        }
+        return;
+    }
+
+    const auto limit = static_cast<float>(INT32_MAX >> (FIXPT_BITS + 1));
+    const auto last = t + (inc * len);
+
+    //we can use fixed point math
+    if (last < limit && last > -limit) {
+        auto pos = static_cast<int32_t>(t * FIXPT_SIZE);
+        const auto step = static_cast<int32_t>(inc * FIXPT_SIZE);
+        for (; dst != end; ++dst, cmp += csize, pos += step) {
+            *dst = opBlendNormal(_fixedPixel(fill, pos), *dst, opaque ? alpha(cmp) : MULTIPLY(alpha(cmp), opacity));
+        }
+        return;
+    }
+
+    //we have to fallback to float math
+    for (; dst != end; ++dst, cmp += csize, t += inc) {
+        *dst = opBlendNormal(_pixel(fill, t / GRADIENT_STOP_SIZE), *dst, opaque ? alpha(cmp) : MULTIPLY(opacity, alpha(cmp)));
+    }
+}
+
+
+//Fills 'len' 8-bit mask pixels of a span starting at (x, y): the alpha of each linear
+//gradient color, scaled by 'a', is merged into the destination through 'maskOp'.
+void fillLinear(const SwFill* fill, uint8_t* dst, uint32_t y, uint32_t x, uint32_t len, SwMask maskOp, uint8_t a)
+{
+    //Rotation
+    float px = x + 0.5f;
+    float py = y + 0.5f;
+    float t = (fill->linear.dx * px + fill->linear.dy * py + fill->linear.offset) * (GRADIENT_STOP_SIZE - 1);
+    float inc = (fill->linear.dx) * (GRADIENT_STOP_SIZE - 1);
+
+    const auto end = dst + len;
+
+    //no change along the span: a single table lookup serves every pixel
+    if (tvg::zero(inc)) {
+        auto src = MULTIPLY(a, A(_fixedPixel(fill, static_cast<int32_t>(t * FIXPT_SIZE))));
+        for (; dst != end; ++dst) {
+            *dst = maskOp(src, *dst, ~src);
+        }
+        return;
+    }
+
+    const auto limit = static_cast<float>(INT32_MAX >> (FIXPT_BITS + 1));
+    const auto last = t + (inc * len);
+
+    //we can use fixed point math
+    if (last < limit && last > -limit) {
+        auto pos = static_cast<int32_t>(t * FIXPT_SIZE);
+        const auto step = static_cast<int32_t>(inc * FIXPT_SIZE);
+        for (; dst != end; ++dst, pos += step) {
+            auto src = MULTIPLY(A(_fixedPixel(fill, pos)), a);
+            *dst = maskOp(src, *dst, ~src);
+        }
+        return;
+    }
+
+    //we have to fallback to float math
+    for (; dst != end; ++dst, t += inc) {
+        auto src = MULTIPLY(A(_pixel(fill, t / GRADIENT_STOP_SIZE)), a);
+        *dst = maskOp(src, *dst, ~src);
+    }
+}
+
+
+//Fills 'len' 8-bit mask pixels of a span starting at (x, y): the alpha of each linear
+//gradient color, scaled by 'a', is combined with the matching 'cmp' pixel through 'maskOp',
+//and the outcome is composited over the existing destination value.
+void fillLinear(const SwFill* fill, uint8_t* dst, uint32_t y, uint32_t x, uint32_t len, uint8_t* cmp, SwMask maskOp, uint8_t a)
+{
+    //Rotation
+    float px = x + 0.5f;
+    float py = y + 0.5f;
+    float t = (fill->linear.dx * px + fill->linear.dy * py + fill->linear.offset) * (GRADIENT_STOP_SIZE - 1);
+    float inc = (fill->linear.dx) * (GRADIENT_STOP_SIZE - 1);
+
+    const auto end = dst + len;
+
+    //no change along the span: a single table lookup serves every pixel
+    if (tvg::zero(inc)) {
+        auto src = A(_fixedPixel(fill, static_cast<int32_t>(t * FIXPT_SIZE)));
+        src = MULTIPLY(src, a);
+        for (; dst != end; ++dst, ++cmp) {
+            auto masked = maskOp(src, *cmp, 0);
+            *dst = masked + MULTIPLY(*dst, ~masked);
+        }
+        return;
+    }
+
+    const auto limit = static_cast<float>(INT32_MAX >> (FIXPT_BITS + 1));
+    const auto last = t + (inc * len);
+
+    //we can use fixed point math
+    if (last < limit && last > -limit) {
+        auto pos = static_cast<int32_t>(t * FIXPT_SIZE);
+        const auto step = static_cast<int32_t>(inc * FIXPT_SIZE);
+        for (; dst != end; ++dst, ++cmp, pos += step) {
+            auto src = MULTIPLY(a, A(_fixedPixel(fill, pos)));
+            auto masked = maskOp(src, *cmp, 0);
+            *dst = masked + MULTIPLY(*dst, ~masked);
+        }
+        return;
+    }
+
+    //we have to fallback to float math
+    for (; dst != end; ++dst, ++cmp, t += inc) {
+        auto src = MULTIPLY(A(_pixel(fill, t / GRADIENT_STOP_SIZE)), a);
+        auto masked = maskOp(src, *cmp, 0);
+        *dst = masked + MULTIPLY(*dst, ~masked);
+    }
+}
+
+
+//Fills 'len' pixels of a span starting at (x, y) with the linear gradient, combining each
+//gradient color with the destination through 'op' and the alpha 'a'.
+void fillLinear(const SwFill* fill, uint32_t* dst, uint32_t y, uint32_t x, uint32_t len, SwBlender op, uint8_t a)
+{
+    //Rotation
+    float px = x + 0.5f;
+    float py = y + 0.5f;
+    float t = (fill->linear.dx * px + fill->linear.dy * py + fill->linear.offset) * (GRADIENT_STOP_SIZE - 1);
+    float inc = (fill->linear.dx) * (GRADIENT_STOP_SIZE - 1);
+
+    const auto end = dst + len;
+
+    //no change along the span: a single table lookup serves every pixel
+    if (tvg::zero(inc)) {
+        auto color = _fixedPixel(fill, static_cast<int32_t>(t * FIXPT_SIZE));
+        for (; dst != end; ++dst) {
+            *dst = op(color, *dst, a);
+        }
+        return;
+    }
+
+    const auto limit = static_cast<float>(INT32_MAX >> (FIXPT_BITS + 1));
+    const auto last = t + (inc * len);
+
+    //we can use fixed point math
+    if (last < limit && last > -limit) {
+        auto pos = static_cast<int32_t>(t * FIXPT_SIZE);
+        const auto step = static_cast<int32_t>(inc * FIXPT_SIZE);
+        for (; dst != end; ++dst, pos += step) {
+            *dst = op(_fixedPixel(fill, pos), *dst, a);
+        }
+        return;
+    }
+
+    //we have to fallback to float math
+    for (; dst != end; ++dst, t += inc) {
+        *dst = op(_pixel(fill, t / GRADIENT_STOP_SIZE), *dst, a);
+    }
+}
+
+
+//Fills 'len' pixels of a span starting at (x, y) with the linear gradient in two stages:
+//'op' combines the gradient color with the destination, 'op2' combines that result with the
+//destination again. Both stages run at full strength (255); when 'a' is not 255 the outcome
+//is interpolated with the destination by 'a', so 'a' is applied exactly once on every path.
+void fillLinear(const SwFill* fill, uint32_t* dst, uint32_t y, uint32_t x, uint32_t len, SwBlender op, SwBlender op2, uint8_t a)
+{
+    //Rotation
+    float rx = x + 0.5f;
+    float ry = y + 0.5f;
+    float t = (fill->linear.dx * rx + fill->linear.dy * ry + fill->linear.offset) * (GRADIENT_STOP_SIZE - 1);
+    float inc = (fill->linear.dx) * (GRADIENT_STOP_SIZE - 1);
+
+    //no change along the span: a single table lookup serves every pixel
+    if (tvg::zero(inc)) {
+        auto color = _fixedPixel(fill, static_cast<int32_t>(t * FIXPT_SIZE));
+        if (a == 255) {
+            for (uint32_t i = 0; i < len; ++i, ++dst) {
+                auto tmp = op(color, *dst, 255);
+                *dst = op2(tmp, *dst, 255);
+            }
+        } else {
+            for (uint32_t i = 0; i < len; ++i, ++dst) {
+                //blend at full strength: 'a' is applied by the INTERPOLATE below, as in the other paths
+                auto tmp = op(color, *dst, 255);
+                auto tmp2 = op2(tmp, *dst, 255);
+                *dst = INTERPOLATE(tmp2, *dst, a);
+            }
+        }
+        return;
+    }
+
+    auto vMax = static_cast<float>(INT32_MAX >> (FIXPT_BITS + 1));
+    auto vMin = -vMax;
+    auto v = t + (inc * len);
+
+    if (a == 255) {
+        //we can use fixed point math
+        if (v < vMax && v > vMin) {
+            auto t2 = static_cast<int32_t>(t * FIXPT_SIZE);
+            auto inc2 = static_cast<int32_t>(inc * FIXPT_SIZE);
+            for (uint32_t j = 0; j < len; ++j, ++dst) {
+                auto tmp = op(_fixedPixel(fill, t2), *dst, 255);
+                *dst = op2(tmp, *dst, 255);
+                t2 += inc2;
+            }
+        //we have to fallback to float math
+        } else {
+            uint32_t counter = 0;
+            while (counter++ < len) {
+                auto tmp = op(_pixel(fill, t / GRADIENT_STOP_SIZE), *dst, 255);
+                *dst = op2(tmp, *dst, 255);
+                ++dst;
+                t += inc;
+            }
+        }
+    } else {
+        //we can use fixed point math
+        if (v < vMax && v > vMin) {
+            auto t2 = static_cast<int32_t>(t * FIXPT_SIZE);
+            auto inc2 = static_cast<int32_t>(inc * FIXPT_SIZE);
+            for (uint32_t j = 0; j < len; ++j, ++dst) {
+                auto tmp = op(_fixedPixel(fill, t2), *dst, 255);
+                auto tmp2 = op2(tmp, *dst, 255);
+                *dst = INTERPOLATE(tmp2, *dst, a);
+                t2 += inc2;
+            }
+        //we have to fallback to float math
+        } else {
+            uint32_t counter = 0;
+            while (counter++ < len) {
+                auto tmp = op(_pixel(fill, t / GRADIENT_STOP_SIZE), *dst, 255);
+                auto tmp2 = op2(tmp, *dst, 255);
+                *dst = INTERPOLATE(tmp2, *dst, a);
+                ++dst;
+                t += inc;
+            }
+        }
+    }
+}
+
+
+//Copies the spread mode into 'fill', prepares the linear or radial parameters depending on
+//the type of 'fdata', and rebuilds the color table when 'ctable' is set.
+//Returns false if 'fill' is null or if any of those steps fails.
+bool fillGenColorTable(SwFill* fill, const Fill* fdata, const Matrix& transform, SwSurface* surface, uint8_t opacity, bool ctable)
+{
+    if (!fill) return false;
+
+    fill->spread = fdata->spread();
+
+    const auto kind = fdata->type();
+    if (kind == Type::LinearGradient) {
+        auto ok = _prepareLinear(fill, static_cast<const LinearGradient*>(fdata), transform);
+        if (!ok) return false;
+    } else if (kind == Type::RadialGradient) {
+        auto ok = _prepareRadial(fill, static_cast<const RadialGradient*>(fdata), transform);
+        if (!ok) return false;
+    }
+
+    return ctable ? _updateColorTable(fill, fdata, surface, opacity) : true;
+}
+
+
+//Returns the last color stop of 'fdata' when the fill is flagged as solid,
+//or nullptr when it isn't solid or has no color stops.
+const Fill::ColorStop* fillFetchSolid(const SwFill* fill, const Fill* fdata)
+{
+    if (!fill->solid) return nullptr;
+
+    const Fill::ColorStop* stops;
+    auto count = fdata->colorStops(&stops);
+    if (count > 0 && stops) return &stops[count - 1];
+
+    return nullptr;
+}
+
+
+//Releases the color table and clears the solid/translucent flags; 'fill' itself is kept.
+void fillReset(SwFill* fill)
+{
+    //free() ignores a null pointer, so no guard is needed
+    free(fill->ctable);
+    fill->ctable = nullptr;
+
+    fill->solid = false;
+    fill->translucent = false;
+}
+
+
+//Releases the color table and the fill object itself; a null 'fill' is ignored.
+void fillFree(SwFill* fill)
+{
+    if (fill) {
+        //free() ignores a null pointer, so the table needs no guard of its own
+        free(fill->ctable);
+        free(fill);
+    }
+}
--- a/thirdparty/thorvg/src/renderer/sw_engine/tvgSwImage.cpp
+++ b/thirdparty/thorvg/src/renderer/sw_engine/tvgSwImage.cpp
@@ -0,0 +1,122 @@
+/*
+ * Copyright (c) 2020 - 2024 the ThorVG project. All rights reserved.
+
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "tvgMath.h"
+#include "tvgSwCommon.h"
+
+/************************************************************************/
+/* Internal Class Implementation                                        */
+/************************************************************************/
+
+//Tells whether the matrix has unit scale factors (e11, e22) and zero e12/e21 terms,
+//i.e. it can only differ from identity by its translation.
+static inline bool _onlyShifted(const Matrix& m)
+{
+    return tvg::equal(m.e11, 1.0f) && tvg::equal(m.e22, 1.0f) && tvg::zero(m.e12) && tvg::zero(m.e21);
+}
+
+
+//Builds the image outline: the four transformed corners of the w x h rectangle plus a
+//closing point that repeats the first one, stored as a single closed contour.
+static bool _genOutline(SwImage* image, const Matrix& transform, SwMpool* mpool, unsigned tid)
+{
+    auto outline = image->outline = mpoolReqOutline(mpool, tid);
+
+    outline->pts.reserve(5);
+    outline->types.reserve(5);
+    outline->cntrs.reserve(1);
+    outline->closed.reserve(1);
+
+    auto w = static_cast<float>(image->w);
+    auto h = static_cast<float>(image->h);
+    Point corners[4] = {{0.0f, 0.0f}, {w, 0.0f}, {w, h}, {0.0f, h}};
+
+    for (auto& corner : corners) {
+        outline->pts.push(mathTransform(&corner, transform));
+        outline->types.push(SW_CURVE_TYPE_POINT);
+    }
+
+    //close the contour on its starting point
+    outline->pts.push(outline->pts[0]);
+    outline->types.push(SW_CURVE_TYPE_POINT);
+    outline->cntrs.push(outline->pts.count - 1);
+    outline->closed.push(true);
+
+    return true;
+}
+
+
+/************************************************************************/
+/* External Class Implementation                                        */
+/************************************************************************/
+
+//Derives the image's drawing parameters from 'transform' (pixel offset for a shift-only
+//matrix, scale information otherwise), generates its outline and updates 'renderRegion'
+//from it. Returns false if either of the last two steps fails.
+bool imagePrepare(SwImage* image, const Matrix& transform, const SwBBox& clipRegion, SwBBox& renderRegion, SwMpool* mpool, unsigned tid)
+{
+    image->direct = _onlyShifted(transform);
+
+    if (!image->direct) {
+        //Figure out the scale factor by transform matrix
+        auto sx = sqrtf((transform.e11 * transform.e11) + (transform.e21 * transform.e21));
+        auto sy = sqrtf((transform.e22 * transform.e22) + (transform.e12 * transform.e12));
+        image->scale = (fabsf(sx - sy) > 0.01f) ? 1.0f : sx;
+        image->scaled = tvg::zero(transform.e12) && tvg::zero(transform.e21);
+    } else {
+        //Fast track: Non-transformed image but just shifted.
+        image->ox = -static_cast<int32_t>(nearbyint(transform.e13));
+        image->oy = -static_cast<int32_t>(nearbyint(transform.e23));
+    }
+
+    return _genOutline(image, transform, mpool, tid)
+        && mathUpdateOutlineBBox(image->outline, clipRegion, renderRegion, image->direct);
+}
+
+
+//Regenerates the image's RLE data from its outline; returns whether rleRender() produced any.
+bool imageGenRle(SwImage* image, const SwBBox& renderRegion, bool antiAlias)
+{
+    image->rle = rleRender(image->rle, image->outline, renderRegion, antiAlias);
+    return image->rle != nullptr;
+}
+
+
+void imageDelOutline(SwImage* image, SwMpool* mpool, uint32_t tid)  //Drops the image's outline; pairs with the mpoolReqOutline() call in _genOutline()
+{
+    mpoolRetOutline(mpool, tid);  //presumably hands the outline of slot 'tid' back to the pool -- TODO confirm against the mpool implementation
+    image->outline = nullptr;     //the image no longer refers to the outline
+}
+
+
+void imageReset(SwImage* image)  //Resets the image's RLE data through rleReset(); no other member is touched here
+{
+    rleReset(image->rle);
+}
+
+
+void imageFree(SwImage* image)  //Releases the image's RLE data through rleFree(); the SwImage object itself is not freed here
+{
+    rleFree(image->rle);
+}
--- a/thirdparty/thorvg/src/renderer/sw_engine/tvgSwMath.cpp
+++ b/thirdparty/thorvg/src/renderer/sw_engine/tvgSwMath.cpp
@@ -0,0 +1,328 @@
+/*
+ * Copyright (c) 2020 - 2024 the ThorVG project. All rights reserved.
+
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "tvgMath.h"
+#include "tvgSwCommon.h"
+
+
+/************************************************************************/
+/* Internal Class Implementation                                        */
+/************************************************************************/
+
+/* Converts a fixed-point angle (degrees scaled by 65536) into radians. */
+static float TO_RADIAN(SwFixed angle)
+{
+    const auto degree = float(angle) / 65536.0f;
+    return degree * (MATH_PI / 180.0f);
+}
+
+
+/************************************************************************/
+/* External Class Implementation                                        */
+/************************************************************************/
+
+/* Returns the angle halfway between angle1 and angle2, measured along the
+   wrapped difference computed by mathDiff(). */
+SwFixed mathMean(SwFixed angle1, SwFixed angle2)
+{
+    const auto half = mathDiff(angle1, angle2) / 2;
+    return angle1 + half;
+}
+
+
+/* Computes the entry, middle and exit angles of the cubic stored in base[0..3].
+   Segments reported as small() are skipped and borrow the angle of a neighbour.
+   Returns -1 when all three segments are small (the curve can be ignored),
+   0 when both angle changes stay below SW_ANGLE_PI / 8, and 1 otherwise. */
+int mathCubicAngle(const SwPoint* base, SwFixed& angleIn, SwFixed& angleMid, SwFixed& angleOut)
+{
+    auto seg1 = base[2] - base[3];
+    auto seg2 = base[1] - base[2];
+    auto seg3 = base[0] - base[1];
+
+    auto tiny1 = seg1.small();
+    auto tiny2 = seg2.small();
+    auto tiny3 = seg3.small();
+
+    //fully degenerate curve: nothing to measure
+    if (tiny1 && tiny2 && tiny3) {
+        angleIn = angleMid = angleOut = 0;
+        return -1;  //ignorable
+    }
+
+    if (tiny1) {
+        if (tiny2) {
+            //only the third segment is usable
+            angleIn = angleMid = angleOut = mathAtan(seg3);
+        } else if (tiny3) {
+            //only the second segment is usable
+            angleIn = angleMid = angleOut = mathAtan(seg2);
+        } else {
+            angleIn = angleMid = mathAtan(seg2);
+            angleOut = mathAtan(seg3);
+        }
+    } else if (tiny2) {
+        if (tiny3) {
+            //only the first segment is usable
+            angleIn = angleMid = angleOut = mathAtan(seg1);
+        } else {
+            //the middle segment is missing: take the mean of both ends
+            angleIn = mathAtan(seg1);
+            angleOut = mathAtan(seg3);
+            angleMid = mathMean(angleIn, angleOut);
+        }
+    } else {
+        angleIn = mathAtan(seg1);
+        if (tiny3) {
+            angleMid = angleOut = mathAtan(seg2);
+        } else {
+            angleMid = mathAtan(seg2);
+            angleOut = mathAtan(seg3);
+        }
+    }
+
+    auto bendIn = abs(mathDiff(angleIn, angleMid));
+    auto bendOut = abs(mathDiff(angleMid, angleOut));
+
+    //both bends are gentle: small size
+    return ((bendIn < (SW_ANGLE_PI / 8)) && (bendOut < (SW_ANGLE_PI / 8))) ? 0 : 1;
+}
+
+
+/* Multiplies two 16.16 fixed-point values.
+   The product of the magnitudes is rounded to nearest (0x8000 is half a unit)
+   before the 16-bit shift, then the sign is applied to the result. */
+int64_t mathMultiply(int64_t a, int64_t b)
+{
+    //the result is negative when exactly one operand is negative
+    const bool negative = (a < 0) != (b < 0);
+
+    const int64_t absA = (a < 0) ? -a : a;
+    const int64_t absB = (b < 0) ? -b : b;
+
+    const int64_t rounded = (absA * absB + 0x8000L) >> 16;
+    return negative ? -rounded : rounded;
+}
+
+
+/* Divides two values and returns the quotient in 16.16 fixed-point,
+   rounded to nearest on the magnitudes. A zero divisor yields 0x7FFFFFFF
+   carrying the sign of the dividend. */
+int64_t mathDivide(int64_t a, int64_t b)
+{
+    //the result is negative when exactly one operand is negative
+    const bool negative = (a < 0) != (b < 0);
+
+    const int64_t num = (a < 0) ? -a : a;
+    const int64_t den = (b < 0) ? -b : b;
+
+    int64_t quotient = 0x7FFFFFFFL;
+    if (den > 0) quotient = ((num << 16) + (den >> 1)) / den;
+
+    return negative ? -quotient : quotient;
+}
+
+
+/* Computes (a * b) / c on the magnitudes, rounded to nearest, and applies the
+   combined sign of the three operands. A zero divisor yields 0x7FFFFFFF
+   carrying the combined sign of a and b. */
+int64_t mathMulDiv(int64_t a, int64_t b, int64_t c)
+{
+    //an odd number of negative operands makes the result negative
+    const bool negative = ((a < 0) != (b < 0)) != (c < 0);
+
+    const int64_t absA = (a < 0) ? -a : a;
+    const int64_t absB = (b < 0) ? -b : b;
+    const int64_t absC = (c < 0) ? -c : c;
+
+    int64_t result = 0x7FFFFFFFL;
+    if (absC > 0) result = (absA * absB + (absC >> 1)) / absC;
+
+    return negative ? -result : result;
+}
+
+
+/* Rotates the point in place by the given fixed-point angle.
+   The rotation is evaluated in float and written back scaled by 64,
+   rounded with nearbyint(). A zero angle or a zero point is left untouched. */
+void mathRotate(SwPoint& pt, SwFixed angle)
+{
+    if (angle == 0) return;
+    if (pt.zero()) return;
+
+    Point src = pt.toPoint();
+
+    auto theta = TO_RADIAN(angle);
+    auto c = cosf(theta);
+    auto s = sinf(theta);
+
+    //standard 2D rotation
+    pt.x = SwCoord(nearbyint((src.x * c - src.y * s) * 64.0f));
+    pt.y = SwCoord(nearbyint((src.x * s + src.y * c) * 64.0f));
+}
+
+
+/* Returns the tangent of a fixed-point angle as a 16.16 fixed-point value. */
+SwFixed mathTan(SwFixed angle)
+{
+    //a zero angle needs no float round trip
+    return (angle == 0) ? 0 : SwFixed(tanf(TO_RADIAN(angle)) * 65536.0f);
+}
+
+
+/* Returns the direction of the vector as a fixed-point angle
+   (degrees scaled by 65536); a zero vector yields 0. */
+SwFixed mathAtan(const SwPoint& pt)
+{
+    if (pt.zero()) return 0;
+
+    auto radian = tvg::atan2(TO_FLOAT(pt.y), TO_FLOAT(pt.x));
+    return SwFixed(radian * (180.0f / MATH_PI) * 65536.0f);
+}
+
+
+/* Returns the sine of a fixed-point angle, evaluated as the cosine of
+   the complementary angle (SW_ANGLE_PI2 - angle). */
+SwFixed mathSin(SwFixed angle)
+{
+    return (angle == 0) ? 0 : mathCos(SW_ANGLE_PI2 - angle);
+}
+
+
+/* Returns the cosine of a fixed-point angle as a 16.16 fixed-point value. */
+SwFixed mathCos(SwFixed angle)
+{
+    const auto radian = TO_RADIAN(angle);
+    return SwFixed(cosf(radian) * 65536.0f);
+}
+
+
+/* Returns an approximate length of the vector.
+   Axis-aligned vectors return the absolute value of their non-zero component.
+   Everything else uses the "alpha max plus beta min" approximation of
+   sqrt(x*x + y*y) with alpha = 1 and beta = 3/8 (largest error below 7%);
+   the exact sqrtf() variant was left disabled in favour of it.
+   NOTE(review): the trivial cases return raw SwCoord values while the general
+   case works on the toPoint() conversion - confirm both share the same unit. */
+SwFixed mathLength(const SwPoint& pt)
+{
+    if (pt.zero()) return 0;
+
+    //trivial case
+    if (pt.x == 0) return abs(pt.y);
+    if (pt.y == 0) return abs(pt.x);
+
+    auto v = pt.toPoint();
+    const auto ax = (v.x < 0) ? -v.x : v.x;
+    const auto ay = (v.y < 0) ? -v.y : v.y;
+
+    //larger component + 3/8 of the smaller one
+    if (ax > ay) return static_cast<SwFixed>(ax + ay * 0.375f);
+    return static_cast<SwFixed>(ay + ax * 0.375f);
+}
+
+
+/* Splits the cubic stored in base[0..3] at its midpoint (de Casteljau with
+   integer halving). On return base[0..3] holds the first half and base[3..6]
+   the second half; base[3] is the shared point. */
+void mathSplitCubic(SwPoint* base)
+{
+    //both coordinates are subdivided by the same scheme
+    auto subdivide = [base](SwCoord SwPoint::* axis) {
+        const SwCoord p0 = base[0].*axis;
+        const SwCoord p1 = base[1].*axis;
+        const SwCoord p2 = base[2].*axis;
+        const SwCoord p3 = base[3].*axis;
+
+        //first level: midpoints of the control polygon
+        const SwCoord m01 = (p0 + p1) >> 1;
+        const SwCoord m12 = (p1 + p2) >> 1;
+        const SwCoord m23 = (p3 + p2) >> 1;
+
+        //second level
+        const SwCoord m012 = (m01 + m12) >> 1;
+        const SwCoord m123 = (m23 + m12) >> 1;
+
+        base[6].*axis = p3;
+        base[1].*axis = m01;
+        base[5].*axis = m23;
+        base[2].*axis = m012;
+        base[4].*axis = m123;
+        base[3].*axis = (m012 + m123) >> 1;
+    };
+
+    subdivide(&SwPoint::x);
+    subdivide(&SwPoint::y);
+}
+
+
+/* Splits the line stored in base[0..1] at its midpoint.
+   On return base[1] is the midpoint and base[2] the original end point. */
+void mathSplitLine(SwPoint* base)
+{
+    const auto& head = base[0];
+    auto& mid = base[1];
+
+    //keep the end point before it is overwritten by the midpoint
+    base[2] = mid;
+
+    mid.x = (head.x + mid.x) >> 1;
+    mid.y = (head.y + mid.y) >> 1;
+}
+
+
+/* Returns the signed difference (angle2 - angle1) wrapped into the
+   range (-SW_ANGLE_PI, SW_ANGLE_PI]. */
+SwFixed mathDiff(SwFixed angle1, SwFixed angle2)
+{
+    auto diff = angle2 - angle1;
+    diff = diff % SW_ANGLE_2PI;
+
+    //normalize into [0, 2PI) first, then fold the upper half down
+    if (diff < 0) diff = diff + SW_ANGLE_2PI;
+    if (diff > SW_ANGLE_PI) diff = diff - SW_ANGLE_2PI;
+
+    return diff;
+}
+
+
+/* Applies the affine part of the matrix (first two rows) to the point and
+   converts the result with TO_SWCOORD(). */
+SwPoint mathTransform(const Point* to, const Matrix& transform)
+{
+    const auto& m = transform;
+
+    auto x = to->x * m.e11 + to->y * m.e12 + m.e13;
+    auto y = to->x * m.e21 + to->y * m.e22 + m.e23;
+
+    return {TO_SWCOORD(x), TO_SWCOORD(y)};
+}
+
+
+/* Intersects clippee with clipper in place.
+   Returns false when the result is below one unit in both width and height,
+   or when it touches or lies beyond any clipper boundary; true otherwise. */
+bool mathClipBBox(const SwBBox& clipper, SwBBox& clippee)
+{
+    //pull every edge of the clippee inside the clipper
+    if (clippee.max.x >= clipper.max.x) clippee.max.x = clipper.max.x;
+    if (clippee.max.y >= clipper.max.y) clippee.max.y = clipper.max.y;
+    if (clippee.min.x <= clipper.min.x) clippee.min.x = clipper.min.x;
+    if (clippee.min.y <= clipper.min.y) clippee.min.y = clipper.min.y;
+
+    //Check valid region: rejected only when both extents are degenerate
+    const auto width = clippee.max.x - clippee.min.x;
+    const auto height = clippee.max.y - clippee.min.y;
+    if (width < 1 && height < 1) return false;
+
+    //Check boundary
+    const bool outside = clippee.min.x >= clipper.max.x || clippee.min.y >= clipper.max.y ||
+                         clippee.max.x <= clipper.min.x || clippee.max.y <= clipper.min.y;
+    return !outside;
+}
+
+
+/* Computes the bounding box of all outline points, converts it from the
+   outline coordinates (scaled by 64) into renderRegion and clips it against
+   clipRegion. With fastTrack the bounds are rounded to nearest, otherwise the
+   minimum is floored and the maximum is ceiled.
+   Returns false for a missing or empty outline (renderRegion is reset in the
+   latter case) or when mathClipBBox() rejects the region. */
+bool mathUpdateOutlineBBox(const SwOutline* outline, const SwBBox& clipRegion, SwBBox& renderRegion, bool fastTrack)
+{
+    if (!outline) return false;
+
+    if (outline->pts.empty() || outline->cntrs.empty()) {
+        renderRegion.reset();
+        return false;
+    }
+
+    //seed the bounds with the first point, then grow them over all points
+    auto first = outline->pts.begin();
+    auto left = first->x;
+    auto right = first->x;
+    auto top = first->y;
+    auto bottom = first->y;
+
+    for (const auto& p : outline->pts) {
+        if (left > p.x) left = p.x;
+        if (right < p.x) right = p.x;
+        if (top > p.y) top = p.y;
+        if (bottom < p.y) bottom = p.y;
+    }
+
+    if (!fastTrack) {
+        //floor the minimum, ceil the maximum
+        renderRegion.min.x = left >> 6;
+        renderRegion.max.x = (right + 63) >> 6;
+        renderRegion.min.y = top >> 6;
+        renderRegion.max.y = (bottom + 63) >> 6;
+    } else {
+        //round to the nearest pixel
+        renderRegion.min.x = static_cast<SwCoord>(round(left / 64.0f));
+        renderRegion.max.x = static_cast<SwCoord>(round(right / 64.0f));
+        renderRegion.min.y = static_cast<SwCoord>(round(top / 64.0f));
+        renderRegion.max.y = static_cast<SwCoord>(round(bottom / 64.0f));
+    }
+    return mathClipBBox(clipRegion, renderRegion);
+}
--- a/thirdparty/thorvg/src/renderer/sw_engine/tvgSwMemPool.cpp
+++ b/thirdparty/thorvg/src/renderer/sw_engine/tvgSwMemPool.cpp
@@ -0,0 +1,129 @@
+/*
+ * Copyright (c) 2020 - 2024 the ThorVG project. All rights reserved.
+
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "tvgSwCommon.h"
+
+
+/************************************************************************/
+/* Internal Class Implementation                                        */
+/************************************************************************/
+
+
+/************************************************************************/
+/* External Class Implementation                                        */
+/************************************************************************/
+
+/* Returns the shape outline slot of the pool at the given index.
+   The index is not range-checked. */
+SwOutline* mpoolReqOutline(SwMpool* mpool, unsigned idx)
+{
+    return mpool->outline + idx;
+}
+
+
+/* Hands the shape outline slot at the given index back to the pool
+   by clearing all of its arrays. */
+void mpoolRetOutline(SwMpool* mpool, unsigned idx)
+{
+    auto& slot = mpool->outline[idx];
+
+    slot.pts.clear();
+    slot.cntrs.clear();
+    slot.types.clear();
+    slot.closed.clear();
+}
+
+
+/* Returns the stroke outline slot of the pool at the given index.
+   The index is not range-checked. */
+SwOutline* mpoolReqStrokeOutline(SwMpool* mpool, unsigned idx)
+{
+    return mpool->strokeOutline + idx;
+}
+
+
+/* Hands the stroke outline slot at the given index back to the pool
+   by clearing all of its arrays. */
+void mpoolRetStrokeOutline(SwMpool* mpool, unsigned idx)
+{
+    auto& slot = mpool->strokeOutline[idx];
+
+    slot.pts.clear();
+    slot.cntrs.clear();
+    slot.types.clear();
+    slot.closed.clear();
+}
+
+
+/* Returns the dash outline slot of the pool at the given index.
+   The index is not range-checked. */
+SwOutline* mpoolReqDashOutline(SwMpool* mpool, unsigned idx)
+{
+    return mpool->dashOutline + idx;
+}
+
+
+/* Hands the dash outline slot at the given index back to the pool
+   by clearing all of its arrays. */
+void mpoolRetDashOutline(SwMpool* mpool, unsigned idx)
+{
+    auto& slot = mpool->dashOutline[idx];
+
+    slot.pts.clear();
+    slot.cntrs.clear();
+    slot.types.clear();
+    slot.closed.clear();
+}
+
+
+/* Creates a memory pool holding (threads + 1) zero-initialized slots for each
+   of the shape, stroke and dash outlines.
+   Returns nullptr when any allocation fails; everything allocated so far is
+   released in that case. The pool must be destroyed with mpoolTerm().
+   NOTE(review): callers are expected to handle a nullptr result - confirm. */
+SwMpool* mpoolInit(uint32_t threads)
+{
+    auto allocSize = threads + 1;
+
+    auto mpool = static_cast<SwMpool*>(calloc(1, sizeof(SwMpool)));
+    if (!mpool) return nullptr;
+
+    //calloc(count, size) lets the allocator guard the multiplication against overflow
+    mpool->outline = static_cast<SwOutline*>(calloc(allocSize, sizeof(SwOutline)));
+    mpool->strokeOutline = static_cast<SwOutline*>(calloc(allocSize, sizeof(SwOutline)));
+    mpool->dashOutline = static_cast<SwOutline*>(calloc(allocSize, sizeof(SwOutline)));
+
+    //roll back on a partial failure, free(nullptr) is a no-op
+    if (!mpool->outline || !mpool->strokeOutline || !mpool->dashOutline) {
+        free(mpool->outline);
+        free(mpool->strokeOutline);
+        free(mpool->dashOutline);
+        free(mpool);
+        return nullptr;
+    }
+
+    mpool->allocSize = allocSize;
+
+    return mpool;
+}
+
+
+/* Resets the arrays of every outline slot in the pool (shape, stroke and dash).
+   The slot arrays themselves stay allocated. Always returns true. */
+bool mpoolClear(SwMpool* mpool)
+{
+    //resets all four arrays of a single outline
+    auto wipe = [](SwOutline& outline) {
+        outline.pts.reset();
+        outline.cntrs.reset();
+        outline.types.reset();
+        outline.closed.reset();
+    };
+
+    for (unsigned i = 0; i < mpool->allocSize; ++i) {
+        wipe(mpool->outline[i]);
+        wipe(mpool->strokeOutline[i]);
+        wipe(mpool->dashOutline[i]);
+    }
+
+    return true;
+}
+
+
+/* Destroys the pool: resets every outline slot, then frees the three slot
+   arrays and the pool itself. Returns false when no pool was given. */
+bool mpoolTerm(SwMpool* mpool)
+{
+    if (mpool) {
+        //release the array contents before the slots themselves
+        mpoolClear(mpool);
+
+        free(mpool->outline);
+        free(mpool->strokeOutline);
+        free(mpool->dashOutline);
+        free(mpool);
+
+        return true;
+    }
+
+    return false;
+}
--- a/thirdparty/thorvg/src/renderer/sw_engine/tvgSwPostEffect.cpp
+++ b/thirdparty/thorvg/src/renderer/sw_engine/tvgSwPostEffect.cpp
@@ -0,0 +1,589 @@
+/*
+ * Copyright (c) 2024 the ThorVG project. All rights reserved.
+
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "tvgMath.h"
+#include "tvgSwCommon.h"
+
+/************************************************************************/
+/* Gaussian Blur Implementation                                         */
+/************************************************************************/
+
+/* Per-effect render data of the gaussian blur, filled in by _gaussianInit().
+   The blur is approximated by running up to MAX_LEVEL box filters in a row. */
+struct SwGaussianBlur
+{
+    static constexpr int MAX_LEVEL = 3;     //upper bound of box filter passes
+    int level;                              //number of passes actually used (derived from the quality)
+    int kernel[MAX_LEVEL];                  //half size (radius) of the box filter of each pass
+    int extends;                            //sum of the used radii, the margin added around the region
+};
+
+
+/* Maps an index onto [0, end] by wrapping around, so that -1 becomes end
+   and end + 1 becomes 0. */
+static inline int _gaussianEdgeWrap(int end, int idx)
+{
+    const int span = end + 1;
+    int wrapped = idx % span;
+
+    //the remainder keeps the sign of idx, shift negatives back into range
+    if (wrapped < 0) wrapped = span + wrapped;
+    return wrapped;
+}
+
+
+/* Maps an index onto [0, end] by clamping, i.e. the edge pixels are extended. */
+static inline int _gaussianEdgeExtend(int end, int idx)
+{
+    return (idx < 0) ? 0 : ((idx > end) ? end : idx);
+}
+
+
+/* Resolves an out-of-range index according to the border mode:
+   1 wraps around, any other value extends the edge. */
+template<int border>
+static inline int _gaussianRemap(int end, int idx)
+{
+    return (border == 1) ? _gaussianEdgeWrap(end, idx) : _gaussianEdgeExtend(end, idx);
+}
+
+
+//TODO: SIMD OPTIMIZATION?
+/* Runs one horizontal box filter pass over a 4-bytes-per-pixel buffer.
+   Every output channel is the average of the (2 * dimension + 1) source
+   pixels centred on it, maintained as a sliding sum per row.
+   - dst/src: byte pointers to the buffers; both are advanced to the bbox origin.
+   - stride: row pitch in pixels. w/h: extent of the processed area in pixels.
+   - flipped: the buffer holds the x/y-transposed image, so the roles of
+     bbox.min.x and bbox.min.y are exchanged when locating the origin.
+   - border (template): edge handling passed to _gaussianRemap(). */
+template<int border = 0>
+static void _gaussianFilter(uint8_t* dst, uint8_t* src, int32_t stride, int32_t w, int32_t h, const SwBBox& bbox, int32_t dimension, bool flipped)
+{
+    //move to the first pixel of the region, "<< 2" converts pixels to bytes
+    if (flipped) {
+        src += (bbox.min.x * stride + bbox.min.y) << 2;
+        dst += (bbox.min.x * stride + bbox.min.y) << 2;
+    } else {
+        src += (bbox.min.y * stride + bbox.min.x) << 2;
+        dst += (bbox.min.y * stride + bbox.min.x) << 2;
+    }
+
+    //reciprocal of the window size, turns the accumulated sum into an average
+    auto iarr = 1.0f / (dimension + dimension + 1);
+    auto end = w - 1;
+
+    //rows only read src and write their own part of dst
+    #pragma omp parallel for
+    for (int y = 0; y < h; ++y) {
+        auto p = y * stride;
+        auto i = p * 4;                 //current index
+        auto l = -(dimension + 1);      //left index
+        auto r = dimension;             //right index
+        int acc[4] = {0, 0, 0, 0};      //sliding accumulator
+
+        //initial accumulation
+        for (int x = l; x < r; ++x) {
+            auto id = (_gaussianRemap<border>(end, x) + p) * 4;
+            acc[0] += src[id++];
+            acc[1] += src[id++];
+            acc[2] += src[id++];
+            acc[3] += src[id];
+        }
+        //perform filtering: add the pixel entering on the right, drop the one leaving on the left
+        for (int x = 0; x < w; ++x, ++r, ++l) {
+            auto rid = (_gaussianRemap<border>(end, r) + p) * 4;
+            auto lid = (_gaussianRemap<border>(end, l) + p) * 4;
+            acc[0] += src[rid++] - src[lid++];
+            acc[1] += src[rid++] - src[lid++];
+            acc[2] += src[rid++] - src[lid++];
+            acc[3] += src[rid] - src[lid];
+            //ignored rounding for the performance. It should be originally: acc[idx] * iarr + 0.5f
+            dst[i++] = static_cast<uint8_t>(acc[0] * iarr);
+            dst[i++] = static_cast<uint8_t>(acc[1] * iarr);
+            dst[i++] = static_cast<uint8_t>(acc[2] * iarr);
+            dst[i++] = static_cast<uint8_t>(acc[3] * iarr);
+        }
+    }
+}
+
+
+/* Derives the box filter setup that approximates a gaussian blur.
+   - data: receives the pass count (level) and the per-pass radii (kernel).
+   - sigma: blur strength; the callers in this file pass the squared, scaled sigma.
+   - quality: selects how many of the MAX_LEVEL passes are used.
+     NOTE(review): presumably limited to 0 - 100 by the caller; larger values
+     would push level beyond MAX_LEVEL - confirm.
+   Returns the sum of the used radii, or 0 when sigma is (nearly) zero; in the
+   latter case data is left untouched. */
+static int _gaussianInit(SwGaussianBlur* data, float sigma, int quality)
+{
+    const auto MAX_LEVEL = SwGaussianBlur::MAX_LEVEL;
+
+    if (tvg::zero(sigma)) return 0;
+
+    data->level = int(SwGaussianBlur::MAX_LEVEL * ((quality - 1) * 0.01f)) + 1;
+
+    //compute box kernel sizes
+    auto wl = (int) sqrt((12 * sigma / MAX_LEVEL) + 1);    //lower box width...
+    if (wl % 2 == 0) --wl;                                 //...forced to be odd
+    auto wu = wl + 2;                                      //upper box width, the next odd value
+    auto mi = (12 * sigma - MAX_LEVEL * wl * wl - 4 * MAX_LEVEL * wl - 3 * MAX_LEVEL) / (-4 * wl - 4);
+    auto m = int(mi + 0.5f);                               //number of passes that use the lower width
+    auto extends = 0;
+
+    //store the radius of each pass and accumulate the overall margin
+    for (int i = 0; i < data->level; i++) {
+        data->kernel[i] = ((i < m ? wl : wu) - 1) / 2;
+        extends += data->kernel[i];
+    }
+
+    return extends;
+}
+
+
+/* Reports how far the blur bleeds outside of the source region.
+   params->extend is grown by the blur margin on both sides of every blurred
+   axis; direction 2 skips the horizontal axis, direction 1 the vertical one.
+   Expects params->rd to have been prepared by effectGaussianBlurUpdate().
+   Always returns true. */
+bool effectGaussianBlurRegion(RenderEffectGaussianBlur* params)
+{
+    const auto margin = static_cast<SwGaussianBlur*>(params->rd)->extends;
+    const bool horizontal = (params->direction != 2);
+    const bool vertical = (params->direction != 1);
+
+    //bbox region expansion for feathering
+    auto& bounds = params->extend;
+
+    if (horizontal) {
+        bounds.x = -margin;
+        bounds.w = margin * 2;
+    }
+    if (vertical) {
+        bounds.y = -margin;
+        bounds.h = margin * 2;
+    }
+
+    return true;
+}
+
+
+/* Prepares (or refreshes) the render data of a gaussian blur effect.
+   The render data is allocated on first use and kept in params->rd.
+   The effect is flagged invalid when the allocation fails or when the
+   resulting blur margin is zero. */
+void effectGaussianBlurUpdate(RenderEffectGaussianBlur* params, const Matrix& transform)
+{
+    if (!params->rd) params->rd = static_cast<SwGaussianBlur*>(malloc(sizeof(SwGaussianBlur)));
+    auto rd = static_cast<SwGaussianBlur*>(params->rd);
+
+    //out of memory: the effect cannot be applied
+    if (!rd) {
+        params->valid = false;
+        return;
+    }
+
+    //compute box kernel sizes, sigma follows the scale of the transform
+    auto scale = sqrt(transform.e11 * transform.e11 + transform.e12 * transform.e12);
+    rd->extends = _gaussianInit(rd, std::pow(params->sigma * scale, 2), params->quality);
+
+    //invalid
+    if (rd->extends == 0) {
+        params->valid = false;
+        return;
+    }
+
+    params->valid = true;
+}
+
+
+/* Applies the gaussian blur to the compositor image within cmp->bbox.
+   The filter ping-pongs between the compositor image (front) and the image of
+   surface->compositor (back); every filter pass swaps the two.
+   Direction 2 skips the horizontal passes, direction 1 the vertical ones.
+   Always returns true. */
+bool effectGaussianBlur(SwCompositor* cmp, SwSurface* surface, const RenderEffectGaussianBlur* params)
+{
+    auto& buffer = surface->compositor->image;
+    auto data = static_cast<SwGaussianBlur*>(params->rd);
+    auto& bbox = cmp->bbox;
+    auto w = (bbox.max.x - bbox.min.x);
+    auto h = (bbox.max.y - bbox.min.y);
+    auto stride = cmp->image.stride;
+    auto front = cmp->image.buf32;
+    auto back = buffer.buf32;
+    auto swapped = false;       //true while the latest result lives in the other buffer
+
+    TVGLOG("SW_ENGINE", "GaussianFilter region(%ld, %ld, %ld, %ld) params(%f %d %d), level(%d)", bbox.min.x, bbox.min.y, bbox.max.x, bbox.max.y, params->sigma, params->direction, params->border, data->level);
+
+    /* It is best to take advantage of the Gaussian blur’s separable property
+       by dividing the process into two passes. horizontal and vertical.
+       We can expect fewer calculations. */
+
+    //horizontal
+    if (params->direction != 2) {
+        for (int i = 0; i < data->level; ++i) {
+            _gaussianFilter(reinterpret_cast<uint8_t*>(back), reinterpret_cast<uint8_t*>(front), stride, w, h, bbox, data->kernel[i], false);
+            std::swap(front, back);
+            swapped = !swapped;
+        }
+    }
+
+    //vertical. x/y flipping and horizontal access is pretty compatible with the memory architecture.
+    if (params->direction != 1) {
+        //transpose, so that the columns can be filtered as rows
+        rasterXYFlip(front, back, stride, w, h, bbox, false);
+        std::swap(front, back);
+
+        for (int i = 0; i < data->level; ++i) {
+            _gaussianFilter(reinterpret_cast<uint8_t*>(back), reinterpret_cast<uint8_t*>(front), stride, h, w, bbox, data->kernel[i], true);
+            std::swap(front, back);
+            swapped = !swapped;
+        }
+
+        //transpose back; the two flips swap the buffers twice, hence they are not tracked in 'swapped'
+        rasterXYFlip(front, back, stride, h, w, bbox, true);
+        std::swap(front, back);
+    }
+
+    //odd number of passes: exchange the buffers so that the compositor image holds the result
+    if (swapped) std::swap(cmp->image.buf8, buffer.buf8);
+
+    return true;
+}
+
+/************************************************************************/
+/* Drop Shadow Implementation                                           */
+/************************************************************************/
+
+/* Per-effect render data of the drop shadow: the blur setup inherited from
+   SwGaussianBlur plus the displacement of the shadow. */
+struct SwDropShadow : SwGaussianBlur
+{
+    SwPoint offset;     //shadow displacement, computed in effectDropShadowUpdate()
+};
+
+
+//TODO: SIMD OPTIMIZATION?
+/* Runs one horizontal box filter pass over the alpha channel only.
+   Every output pixel is the given color scaled (ALPHA_BLEND) by the average
+   alpha of the (2 * dimension + 1) source pixels centred on it; pixels beyond
+   the row ends reuse the edge pixel.
+   - dst/src: 32-bit pixel buffers; both are advanced to the bbox origin.
+   - stride: row pitch in pixels. w/h: extent of the processed area in pixels.
+   - flipped: the buffer holds the x/y-transposed image, so the roles of
+     bbox.min.x and bbox.min.y are exchanged when locating the origin. */
+static void _dropShadowFilter(uint32_t* dst, uint32_t* src, int stride, int w, int h, const SwBBox& bbox, int32_t dimension, uint32_t color, bool flipped)
+{
+    //move to the first pixel of the region
+    if (flipped) {
+        src += (bbox.min.x * stride + bbox.min.y);
+        dst += (bbox.min.x * stride + bbox.min.y);
+    } else {
+        src += (bbox.min.y * stride + bbox.min.x);
+        dst += (bbox.min.y * stride + bbox.min.x);
+    }
+    //reciprocal of the window size, turns the accumulated sum into an average
+    auto iarr = 1.0f / (dimension + dimension + 1);
+    auto end = w - 1;
+
+    //rows only read src and write their own part of dst
+    #pragma omp parallel for
+    for (int y = 0; y < h; ++y) {
+        auto p = y * stride;
+        auto i = p;                     //current index
+        auto l = -(dimension + 1);      //left index
+        auto r = dimension;             //right index
+        int acc = 0;                    //sliding accumulator
+
+        //initial accumulation
+        for (int x = l; x < r; ++x) {
+            auto id = _gaussianEdgeExtend(end, x) + p;
+            acc += A(src[id]);
+        }
+        //perform filtering: add the alpha entering on the right, drop the one leaving on the left
+        for (int x = 0; x < w; ++x, ++r, ++l) {
+            auto rid = _gaussianEdgeExtend(end, r) + p;
+            auto lid = _gaussianEdgeExtend(end, l) + p;
+            acc += A(src[rid]) - A(src[lid]);
+            //ignored rounding for the performance. It should be originally: acc * iarr
+            dst[i++] = ALPHA_BLEND(color, static_cast<uint8_t>(acc * iarr));
+        }
+    }
+}
+
+
+/* Copies the shadow from src to dst, displaced by the given offset.
+   - dstride/sstride: row pitch of dst and src in pixels.
+   - region: the area to transfer; both pointers start at its top-left corner.
+   - opacity: passed on to the pixel rasterizer.
+   - direct: forces the translucent path.
+   NOTE(review): w and h are not reduced by the offset, so the shifted rows may
+   reach beyond the region - confirm the buffers leave room for that. */
+static void _dropShadowShift(uint32_t* dst, uint32_t* src, int dstride, int sstride, SwBBox& region, SwPoint& offset, uint8_t opacity, bool direct)
+{
+    src += (region.min.y * sstride + region.min.x);
+    dst += (region.min.y * dstride + region.min.x);
+
+    auto w = region.max.x - region.min.x;
+    auto h = region.max.y - region.min.y;
+    auto translucent = (direct || opacity < 255);
+
+    //shift offset: when the shifted origin would become negative the source
+    //is advanced instead of moving the destination backwards
+    if (region.min.x + offset.x < 0) src -= offset.x;
+    else dst += offset.x;
+
+    if (region.min.y + offset.y < 0) src -= (offset.y * sstride);
+    else dst += (offset.y * dstride);
+
+    //transfer row by row
+    for (auto y = 0; y < h; ++y) {
+        if (translucent) rasterTranslucentPixel32(dst, src, w, opacity);
+        else rasterPixel32(dst, src, w, opacity);
+        src += sstride;
+        dst += dstride;
+    }
+}
+
+
+/* Reports how far the shadow reaches outside of the source region.
+   params->extend is grown by the blur margin on every side and additionally
+   by the shadow offset towards the direction the shadow is displaced.
+   Expects params->rd to have been prepared by effectDropShadowUpdate().
+   Always returns true. */
+bool effectDropShadowRegion(RenderEffectDropShadow* params)
+{
+    auto rd = static_cast<SwDropShadow*>(params->rd);
+    auto& shift = rd->offset;
+    const auto margin = rd->extends;
+
+    //bbox region expansion for feathering
+    auto& bounds = params->extend;
+
+    //a negative offset moves the origin further out
+    const auto dx = static_cast<int32_t>(shift.x);
+    const auto dy = static_cast<int32_t>(shift.y);
+
+    bounds.x = -margin;
+    if (dx < 0) bounds.x += dx;
+
+    bounds.y = -margin;
+    if (dy < 0) bounds.y += dy;
+
+    //the size grows by the offset regardless of its sign
+    bounds.w = margin * 2;
+    bounds.w += abs(shift.x);
+
+    bounds.h = margin * 2;
+    bounds.h += abs(shift.y);
+
+    return true;
+}
+
+
+/* Prepares (or refreshes) the render data of a drop shadow effect.
+   The render data is allocated on first use and kept in params->rd.
+   The effect is flagged invalid when the allocation fails, when the blur
+   margin is zero or when the shadow color is fully transparent.
+   The offset is derived from the distance and the angle (in degrees);
+   a non-positive distance yields no displacement. */
+void effectDropShadowUpdate(RenderEffectDropShadow* params, const Matrix& transform)
+{
+    if (!params->rd) params->rd = static_cast<SwDropShadow*>(malloc(sizeof(SwDropShadow)));
+    auto rd = static_cast<SwDropShadow*>(params->rd);
+
+    //out of memory: the effect cannot be applied
+    if (!rd) {
+        params->valid = false;
+        return;
+    }
+
+    //compute box kernel sizes, sigma follows the scale of the transform
+    auto scale = sqrt(transform.e11 * transform.e11 + transform.e12 * transform.e12);
+    rd->extends = _gaussianInit(rd, std::pow(params->sigma * scale, 2), params->quality);
+
+    //invalid
+    if (rd->extends == 0 || params->color[3] == 0) {
+        params->valid = false;
+        return;
+    }
+
+    //offset
+    if (params->distance > 0.0f) {
+        auto radian = tvg::deg2rad(90.0f - params->angle);
+        rd->offset = {static_cast<SwCoord>(params->distance * cosf(radian)), static_cast<SwCoord>(-1.0f * params->distance * sinf(radian))};
+    } else {
+        rd->offset = {0, 0};
+    }
+
+    params->valid = true;
+}
+
+
+//Closely follows the flow of effectGaussianBlur(); see that function for detailed comments.
+//surface[0]: the original image, to overlay it into the filtered image.
+//surface[1]: temporary buffer for generating the filtered image.
+/* Renders a blurred, tinted and displaced copy of the compositor image's alpha
+   (the shadow) within cmp->bbox.
+   - surface[0]: receives the pointer to the original image.
+   - surface[1]: scratch buffer for the filter passes.
+   - direct: the shadow is blended straight into cmp->recoverSfc; otherwise the
+     shadow and the original image are composed into the compositor image.
+   Returns true in all cases, also when nothing is drawn. */
+bool effectDropShadow(SwCompositor* cmp, SwSurface* surface[2], const RenderEffectDropShadow* params, bool direct)
+{
+    //FIXME: if the body is partially visible due to clipping, the shadow also becomes partially visible.
+
+    auto data = static_cast<SwDropShadow*>(params->rd);
+    auto& bbox = cmp->bbox;
+    auto w = (bbox.max.x - bbox.min.x);
+    auto h = (bbox.max.y - bbox.min.y);
+
+    //outside the screen
+    if (abs(data->offset.x) >= w || abs(data->offset.y) >= h) return true;
+
+    SwImage* buffer[] = {&surface[0]->compositor->image, &surface[1]->compositor->image};
+    //the shadow color is built fully opaque, its alpha is applied separately as the opacity below
+    auto color = cmp->recoverSfc->join(params->color[0], params->color[1], params->color[2], 255);
+    auto stride = cmp->image.stride;
+    auto front = cmp->image.buf32;
+    auto back = buffer[1]->buf32;
+
+    auto opacity = direct ? MULTIPLY(params->color[3], cmp->opacity) : params->color[3];
+
+    TVGLOG("SW_ENGINE", "DropShadow region(%ld, %ld, %ld, %ld) params(%f %f %f), level(%d)", bbox.min.x, bbox.min.y, bbox.max.x, bbox.max.y, params->angle, params->distance, params->sigma, data->level);
+
+    //saving the original image in order to overlay it into the filtered image.
+    _dropShadowFilter(back, front, stride, w, h, bbox, data->kernel[0], color, false);
+    //buffer[0] takes over the pointer to the original image, front/back continue on the two spare buffers
+    std::swap(front, buffer[0]->buf32);
+    std::swap(front, back);
+
+    //horizontal (the first pass has been done above)
+    for (int i = 1; i < data->level; ++i) {
+        _dropShadowFilter(back, front, stride, w, h, bbox, data->kernel[i], color, false);
+        std::swap(front, back);
+    }
+
+    //vertical
+    rasterXYFlip(front, back, stride, w, h, bbox, false);
+    std::swap(front, back);
+
+    for (int i = 0; i < data->level; ++i) {
+        _dropShadowFilter(back, front, stride, h, w, bbox, data->kernel[i], color, true);
+        std::swap(front, back);
+    }
+
+    //flip back, then make the filtered result the compositor image
+    rasterXYFlip(front, back, stride, h, w, bbox, true);
+    std::swap(cmp->image.buf32, back);
+
+    //draw to the main surface directly
+    if (direct) {
+        _dropShadowShift(cmp->recoverSfc->buf32, cmp->image.buf32, cmp->recoverSfc->stride, stride, bbox, data->offset, opacity, direct);
+        //hand the original image back to the compositor
+        std::swap(cmp->image.buf32, buffer[0]->buf32);
+        return true;
+    }
+
+    //draw to the intermediate surface
+    rasterClear(surface[1], bbox.min.x, bbox.min.y, w, h);
+    _dropShadowShift(buffer[1]->buf32, cmp->image.buf32, stride, stride, bbox, data->offset, opacity, direct);
+    std::swap(cmp->image.buf32, buffer[1]->buf32);
+
+    //compositing shadow and body
+    auto s = buffer[0]->buf32 + (bbox.min.y * buffer[0]->stride + bbox.min.x);
+    auto d = cmp->image.buf32 + (bbox.min.y * cmp->image.stride + bbox.min.x);
+
+    for (auto y = 0; y < h; ++y) {
+        rasterTranslucentPixel32(d, s, w, 255);
+        s += buffer[0]->stride;
+        d += cmp->image.stride;
+    }
+
+    return true;
+}
+
+
+/************************************************************************/
+/* Fill Implementation                                                  */
+/************************************************************************/
+
+/* Refreshes a fill effect: nothing needs to be precomputed, the effect is
+   simply flagged valid. */
+void effectFillUpdate(RenderEffectFill* params)
+{
+    params->valid = true;
+}
+
+
+/* Replaces the color of the compositor image within cmp->bbox by the fill
+   color, keeping the coverage (alpha) of the image.
+   - direct: the result is blended straight into cmp->recoverSfc and the
+     compositor is flagged valid so that no further composition is needed;
+     otherwise the compositor image is rewritten in place.
+   Always returns true. */
+bool effectFill(SwCompositor* cmp, const RenderEffectFill* params, bool direct)
+{
+    //in the direct case the compositor opacity has to be applied here as well
+    auto opacity = direct ? MULTIPLY(params->color[3], cmp->opacity) : params->color[3];
+
+    auto& bbox = cmp->bbox;
+    auto w = size_t(bbox.max.x - bbox.min.x);
+    auto h = size_t(bbox.max.y - bbox.min.y);
+    auto color = cmp->recoverSfc->join(params->color[0], params->color[1], params->color[2], 255);
+
+    TVGLOG("SW_ENGINE", "Fill region(%ld, %ld, %ld, %ld), param(%d %d %d %d)", bbox.min.x, bbox.min.y, bbox.max.x, bbox.max.y, params->color[0], params->color[1], params->color[2], params->color[3]);
+
+    if (direct) {
+        auto dbuffer = cmp->recoverSfc->buf32 + (bbox.min.y * cmp->recoverSfc->stride + bbox.min.x);
+        auto sbuffer = cmp->image.buf32 + (bbox.min.y * cmp->image.stride + bbox.min.x);
+        for (size_t y = 0; y < h; ++y) {
+            auto dst = dbuffer;
+            auto src = sbuffer;
+            for (size_t x = 0; x < w; ++x, ++dst, ++src) {
+                //source-over blending of the fill color, weighted by the source coverage
+                auto a = MULTIPLY(opacity, A(*src));
+                auto tmp = ALPHA_BLEND(color, a);
+                *dst = tmp + ALPHA_BLEND(*dst, 255 - a);
+            }
+            //each buffer advances by the stride of the surface it belongs to
+            dbuffer += cmp->recoverSfc->stride;
+            sbuffer += cmp->image.stride;
+        }
+        cmp->valid = true;  //no need the subsequent composition
+    } else {
+        auto dbuffer = cmp->image.buf32 + (bbox.min.y * cmp->image.stride + bbox.min.x);
+        for (size_t y = 0; y < h; ++y) {
+            auto dst = dbuffer;
+            for (size_t x = 0; x < w; ++x, ++dst) {
+                *dst = ALPHA_BLEND(color, MULTIPLY(opacity, A(*dst)));
+            }
+            dbuffer += cmp->image.stride;
+        }
+    }
+    return true;
+}
+
+
+/************************************************************************/
+/* Tint Implementation                                                  */
+/************************************************************************/
+
+/* Refreshes a tint effect: nothing needs to be precomputed, the effect is
+   simply flagged valid. */
+void effectTintUpdate(RenderEffectTint* params)
+{
+    params->valid = true;
+}
+
+
+/* Tints the compositor image within cmp->bbox: every pixel is mapped onto the
+   black-to-white color ramp by its luminance and then mixed with the original
+   pixel according to params->intensity.
+   - direct: the result is blended straight into cmp->recoverSfc and the
+     compositor is flagged valid so that no further composition is needed;
+     otherwise the compositor image is rewritten in place.
+   Always returns true. */
+bool effectTint(SwCompositor* cmp, const RenderEffectTint* params, bool direct)
+{
+    auto& bbox = cmp->bbox;
+    auto w = size_t(bbox.max.x - bbox.min.x);
+    auto h = size_t(bbox.max.y - bbox.min.y);
+    auto black = cmp->recoverSfc->join(params->black[0], params->black[1], params->black[2], 255);
+    auto white = cmp->recoverSfc->join(params->white[0], params->white[1], params->white[2], 255);
+    auto opacity = cmp->opacity;
+    auto luma = cmp->recoverSfc->alphas[2];  //luma function
+
+    TVGLOG("SW_ENGINE", "Tint region(%ld, %ld, %ld, %ld), param(%d %d %d, %d %d %d, %d)", bbox.min.x, bbox.min.y, bbox.max.x, bbox.max.y, params->black[0], params->black[1], params->black[2], params->white[0], params->white[1], params->white[2], params->intensity);
+
+    /* Tint Formula: (1 - L) * Black + L * White, where the L is Luminance. */
+
+    if (direct) {
+        auto dbuffer = cmp->recoverSfc->buf32 + (bbox.min.y * cmp->recoverSfc->stride + bbox.min.x);
+        auto sbuffer = cmp->image.buf32 + (bbox.min.y * cmp->image.stride + bbox.min.x);
+        for (size_t y = 0; y < h; ++y) {
+            auto dst = dbuffer;
+            auto src = sbuffer;
+            for (size_t x = 0; x < w; ++x, ++dst, ++src) {
+                //the luminance is taken from the unpremultiplied color
+                auto tmp = rasterUnpremultiply(*src);
+                auto val = INTERPOLATE(INTERPOLATE(black, white, luma((uint8_t*)&tmp)), tmp, params->intensity);
+                *dst = INTERPOLATE(val, *dst, MULTIPLY(opacity, A(tmp)));
+            }
+            //each buffer advances by the stride of the surface it belongs to
+            dbuffer += cmp->recoverSfc->stride;
+            sbuffer += cmp->image.stride;
+        }
+        cmp->valid = true;  //no need the subsequent composition
+    } else {
+        auto dbuffer = cmp->image.buf32 + (bbox.min.y * cmp->image.stride + bbox.min.x);
+        for (size_t y = 0; y < h; ++y) {
+            auto dst = dbuffer;
+            for (size_t x = 0; x < w; ++x, ++dst) {
+                //the luminance is taken from the unpremultiplied color
+                auto tmp = rasterUnpremultiply(*dst);
+                auto val = INTERPOLATE(INTERPOLATE(black, white, luma((uint8_t*)&tmp)), tmp, params->intensity);
+                *dst = ALPHA_BLEND(val, A(tmp));
+            }
+            dbuffer += cmp->image.stride;
+        }
+    }
+
+    return true;
+}
+
+
+/************************************************************************/
+/* Tritone Implementation                                              */
+/************************************************************************/
+
+/* Maps a luminance value l onto the shadow (s) / midtone (m) / highlight (h) colors. */
+static uint32_t _trintone(uint32_t s, uint32_t m, uint32_t h, int l)
+{
+    /* Tritone Formula:
+       if (L < 0.5) { (1 - 2L) * Shadow + 2L * Midtone }
+       else { (1 - 2(L - 0.5)) * Midtone + (2(L - 0.5)) * Highlight }
+       Where the L is Luminance. */
+
+    //lower half of the range: blend shadow with midtone
+    if (l < 128) {
+        auto weight = std::min(l * 2, 255);
+        return ALPHA_BLEND(s, 255 - weight) + ALPHA_BLEND(m, weight);
+    }
+
+    //upper half of the range: blend midtone with highlight
+    auto weight = 2 * std::max(0, l - 128);
+    return ALPHA_BLEND(m, 255 - weight) + ALPHA_BLEND(h, weight);
+}
+
+
+void effectTritoneUpdate(RenderEffectTritone* params)
+{
+    //No derived data is prepared here; the parameters are only flagged as usable.
+    params->valid = true;
+}
+
+
+/* Applies the tritone effect to every pixel inside cmp->bbox.
+   direct == true : pixels are read from cmp->image, recolored and blended straight into
+                    cmp->recoverSfc; cmp->valid is set so no subsequent composition is needed.
+   direct == false: cmp->image is recolored in place.
+   Always returns true. */
+bool effectTritone(SwCompositor* cmp, const RenderEffectTritone* params, bool direct)
+{
+    auto& bbox = cmp->bbox;
+    auto w = size_t(bbox.max.x - bbox.min.x);
+    auto h = size_t(bbox.max.y - bbox.min.y);
+    auto shadow = cmp->recoverSfc->join(params->shadow[0], params->shadow[1], params->shadow[2], 255);
+    auto midtone = cmp->recoverSfc->join(params->midtone[0], params->midtone[1], params->midtone[2], 255);
+    auto highlight = cmp->recoverSfc->join(params->highlight[0], params->highlight[1], params->highlight[2], 255);
+    auto opacity = cmp->opacity;
+    auto luma = cmp->recoverSfc->alphas[2];  //luma function
+
+    TVGLOG("SW_ENGINE", "Tritone region(%ld, %ld, %ld, %ld), param(%d %d %d, %d %d %d, %d %d %d)", bbox.min.x, bbox.min.y, bbox.max.x, bbox.max.y, params->shadow[0], params->shadow[1], params->shadow[2], params->midtone[0], params->midtone[1], params->midtone[2], params->highlight[0], params->highlight[1], params->highlight[2]);
+
+    if (direct) {
+        auto dbuffer = cmp->recoverSfc->buf32 + (bbox.min.y * cmp->recoverSfc->stride + bbox.min.x);
+        auto sbuffer = cmp->image.buf32 + (bbox.min.y * cmp->image.stride + bbox.min.x);
+        for (size_t y = 0; y < h; ++y) {
+            auto dst = dbuffer;
+            auto src = sbuffer;
+            for (size_t x = 0; x < w; ++x, ++dst, ++src) {
+                //recolor the unpremultiplied pixel, then blend by its alpha scaled with the opacity
+                auto tmp = rasterUnpremultiply(*src);
+                *dst = INTERPOLATE(_trintone(shadow, midtone, highlight, luma((uint8_t*)&tmp)), *dst, MULTIPLY(opacity, A(tmp)));
+            }
+            //each row pointer must advance by the stride of the buffer it points into
+            dbuffer += cmp->recoverSfc->stride;
+            sbuffer += cmp->image.stride;
+        }
+        cmp->valid = true;  //no need for the subsequent composition
+    } else {
+        auto dbuffer = cmp->image.buf32 + (bbox.min.y * cmp->image.stride + bbox.min.x);
+        for (size_t y = 0; y < h; ++y) {
+            auto dst = dbuffer;
+            for (size_t x = 0; x < w; ++x, ++dst) {
+                auto tmp = rasterUnpremultiply(*dst);
+                //premultiply again with the original pixel alpha
+                *dst = ALPHA_BLEND(_trintone(shadow, midtone, highlight, luma((uint8_t*)&tmp)), A(tmp));
+            }
+            dbuffer += cmp->image.stride;
+        }
+    }
+
+    return true;
+}
--- a/thirdparty/thorvg/src/renderer/sw_engine/tvgSwRaster.cpp
+++ b/thirdparty/thorvg/src/renderer/sw_engine/tvgSwRaster.cpp
--- a/thirdparty/thorvg/src/renderer/sw_engine/tvgSwRasterAvx.h
+++ b/thirdparty/thorvg/src/renderer/sw_engine/tvgSwRasterAvx.h
@@ -0,0 +1,230 @@
+/*
+ * Copyright (c) 2021 - 2024 the ThorVG project. All rights reserved.
+
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifdef THORVG_AVX_VECTOR_SUPPORT
+
+#include <immintrin.h>
+
+#define N_32BITS_IN_128REG 4  //number of 32-bit pixels held by one 128-bit register (__m128i)
+#define N_32BITS_IN_256REG 8  //number of 32-bit pixels held by one 256-bit register (__m256i)
+
+/* Vector counterpart of the scalar ALPHA_BLEND: scales the four 8-bit channels of each of the
+   four packed 32-bit colors in c by the matching byte of a (expected as the quartet [a, a, a, a]). */
+static inline __m128i ALPHA_BLEND(__m128i c, __m128i a)
+{
+    //masks selecting the A/G and the R/B channels
+    const auto maskAG = _mm_set1_epi32(0xff00ff00);
+    const auto maskRB = _mm_set1_epi32(0x00ff00ff);
+
+    //2nd and 4th channel: the masked color times the masked alpha fits a 16-bit lane;
+    //adding 0xff compensates for shifting by 8 (dividing by 256) instead of dividing by 255
+    auto even = _mm_mullo_epi16(_mm_and_si128(c, maskRB), _mm_and_si128(a, maskRB));
+    even = _mm_srli_epi16(_mm_add_epi16(even, maskRB), 8);
+
+    //1st and 3rd channel: both operands sit in the high byte of the lane, so only the high half
+    //of the product is kept; the same 0xff correction is added and the low 8 bits are cleared
+    auto odd = _mm_mulhi_epu16(_mm_and_si128(c, maskAG), _mm_and_si128(a, maskAG));
+    odd = _mm_and_si128(_mm_add_epi16(odd, maskRB), maskAG);
+
+    //merge both halves
+    return _mm_or_si128(odd, even);
+}
+
+
+/* Fills len bytes with val, starting at dst + offset. */
+static void avxRasterGrayscale8(uint8_t* dst, uint8_t val, uint32_t offset, int32_t len)
+{
+    auto out = dst + offset;
+    const __m256i fill = _mm256_set1_epi8(val);
+
+    //bulk part: 32 bytes per unaligned 256-bit store
+    int32_t done = 0;
+    while (done <= len - 32) {
+        _mm256_storeu_si256((__m256i*)(out + done), fill);
+        done += 32;
+    }
+
+    //remaining bytes, one at a time
+    while (done < len) {
+        out[done] = val;
+        ++done;
+    }
+}
+
+
+/* Fills len 32-bit pixels with val, starting at dst + offset.
+   A length that is zero or negative writes nothing. */
+static void avxRasterPixel32(uint32_t *dst, uint32_t val, uint32_t offset, int32_t len)
+{
+    //a negative length used to wrap the counters below and run far past the buffer
+    if (len <= 0) return;
+
+    //1. calculate how many iterations we need to cover the length
+    uint32_t iterations = len / N_32BITS_IN_256REG;
+    uint32_t avxFilled = iterations * N_32BITS_IN_256REG;
+
+    //2. set the beginning of the array
+    dst += offset;
+
+    //3. fill the octets (the broadcast value is loop invariant, so it is built once)
+    const auto avxVal = _mm256_set1_epi32(val);
+    for (uint32_t i = 0; i < iterations; ++i, dst += N_32BITS_IN_256REG) {
+        _mm256_storeu_si256((__m256i*)dst, avxVal);
+    }
+
+    //4. fill leftovers (dst already points to the place where the avx job is done)
+    int32_t leftovers = len - avxFilled;
+    while (leftovers-- > 0) *dst++ = val;
+}
+
+
+/* Blends a translucent solid color over the rectangle 'region' of the surface.
+   32-bit surfaces process four pixels per 128-bit register; 8-bit surfaces use a plain loop.
+   Always returns true. */
+static bool avxRasterTranslucentRect(SwSurface* surface, const SwBBox& region, uint8_t r, uint8_t g, uint8_t b, uint8_t a)
+{
+    auto w = static_cast<uint32_t>(region.max.x - region.min.x);
+    auto h = static_cast<uint32_t>(region.max.y - region.min.y);
+
+    //8bit grayscale
+    if (surface->channelSize == sizeof(uint8_t)) {
+        TVGLOG("SW_ENGINE", "Require AVX Optimization, Channel Size = %d", surface->channelSize);
+        auto buffer = surface->buf8 + (region.min.y * surface->stride) + region.min.x;
+        auto ialpha = ~a;
+        for (uint32_t y = 0; y < h; ++y) {
+            auto dst = &buffer[y * surface->stride];
+            for (uint32_t x = 0; x < w; ++x) {
+                dst[x] = a + MULTIPLY(dst[x], ialpha);
+            }
+        }
+        return true;
+    }
+
+    //any other channel size than 32 bits is left untouched
+    if (surface->channelSize != sizeof(uint32_t)) return true;
+
+    //32bits channels
+    auto color = surface->join(r, g, b, a);
+    uint32_t ialpha = 255 - a;
+    auto avxColor = _mm_set1_epi32(color);
+    auto avxIalpha = _mm_set1_epi8(ialpha);
+    auto row = surface->buf32 + (region.min.y * surface->stride) + region.min.x;
+
+    for (uint32_t y = 0; y < h; ++y, row += surface->stride) {
+        auto dst = row;
+
+        //1. scalar head: pixels in front of the next 16-byte boundary (128-bit accesses below need it)
+        auto head = ((uintptr_t)dst & 0xf) / 4;
+        if (head) {
+            head = (N_32BITS_IN_128REG - head > w ? w : N_32BITS_IN_128REG - head);
+            for (uint32_t x = 0; x < head; ++x, ++dst) {
+                *dst = color + ALPHA_BLEND(*dst, ialpha);
+            }
+        }
+
+        //2. aligned middle: N_32BITS_IN_128REG pixels per iteration
+        uint32_t chunks = (w - head) / N_32BITS_IN_128REG;
+        auto avxDst = (__m128i*)dst;
+        for (uint32_t x = 0; x < chunks; ++x, ++avxDst) {
+            *avxDst = _mm_add_epi32(avxColor, ALPHA_BLEND(*avxDst, avxIalpha));
+        }
+
+        //3. scalar tail: whatever did not fill a whole register
+        uint32_t avxFilled = chunks * N_32BITS_IN_128REG;
+        int32_t tail = w - head - avxFilled;
+        for (dst += avxFilled; tail > 0; --tail, ++dst) {
+            *dst = color + ALPHA_BLEND(*dst, ialpha);
+        }
+    }
+    return true;
+}
+
+
+/* Blends a translucent solid color over every span of 'rle', scaled by the span coverage.
+   32-bit surfaces process four pixels per 128-bit register; 8-bit surfaces use a plain loop.
+   Always returns true. */
+static bool avxRasterTranslucentRle(SwSurface* surface, const SwRle* rle, uint8_t r, uint8_t g, uint8_t b, uint8_t a)
+{
+    auto span = rle->spans;
+
+    //8bit grayscale
+    if (surface->channelSize == sizeof(uint8_t)) {
+        TVGLOG("SW_ENGINE", "Require AVX Optimization, Channel Size = %d", surface->channelSize);
+        for (uint32_t i = 0; i < rle->size; ++i, ++span) {
+            auto dst = &surface->buf8[span->y * surface->stride + span->x];
+            uint8_t src = (span->coverage < 255) ? MULTIPLY(span->coverage, a) : a;
+            auto ialpha = ~a;
+            for (uint32_t x = 0; x < span->len; ++x, ++dst) {
+                *dst = src + MULTIPLY(*dst, ialpha);
+            }
+        }
+        return true;
+    }
+
+    //any other channel size than 32 bits is left untouched
+    if (surface->channelSize != sizeof(uint32_t)) return true;
+
+    //32bit channels
+    auto color = surface->join(r, g, b, a);
+
+    for (uint32_t i = 0; i < rle->size; ++i, ++span) {
+        auto dst = &surface->buf32[span->y * surface->stride + span->x];
+
+        //partially covered spans get the color scaled by the coverage
+        uint32_t src = (span->coverage < 255) ? ALPHA_BLEND(color, span->coverage) : color;
+        auto ialpha = IA(src);
+
+        //1. scalar head: pixels in front of the next 16-byte boundary (128-bit accesses below need it)
+        auto head = ((uintptr_t)dst & 0xf) / 4;
+        if (head) {
+            head = (N_32BITS_IN_128REG - head > span->len ? span->len : N_32BITS_IN_128REG - head);
+            for (uint32_t x = 0; x < head; ++x, ++dst) {
+                *dst = src + ALPHA_BLEND(*dst, ialpha);
+            }
+        }
+
+        //2. aligned middle: N_32BITS_IN_128REG pixels per iteration
+        //the avx values are only set up when at least one full register is going to be written
+        uint32_t chunks = (span->len - head) / N_32BITS_IN_128REG;
+        uint32_t avxFilled = 0;
+        if (chunks > 0) {
+            auto avxSrc = _mm_set1_epi32(src);
+            auto avxIalpha = _mm_set1_epi8(ialpha);
+
+            avxFilled = chunks * N_32BITS_IN_128REG;
+            auto avxDst = (__m128i*)dst;
+            for (uint32_t x = 0; x < chunks; ++x, ++avxDst) {
+                *avxDst = _mm_add_epi32(avxSrc, ALPHA_BLEND(*avxDst, avxIalpha));
+            }
+        }
+
+        //3. scalar tail: whatever did not fill a whole register
+        int32_t tail = span->len - head - avxFilled;
+        for (dst += avxFilled; tail > 0; --tail, ++dst) {
+            *dst = src + ALPHA_BLEND(*dst, ialpha);
+        }
+    }
+    return true;
+}
+
+
+#endif
--- a/thirdparty/thorvg/src/renderer/sw_engine/tvgSwRasterC.h
+++ b/thirdparty/thorvg/src/renderer/sw_engine/tvgSwRasterC.h
@@ -0,0 +1,195 @@
+/*
+ * Copyright (c) 2021 - 2024 the ThorVG project. All rights reserved.
+
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+
+/* Blends len source pixels over the destination pixels, the source additionally scaled by opacity. */
+template<typename PIXEL_T>
+static void inline cRasterTranslucentPixels(PIXEL_T* dst, PIXEL_T* src, uint32_t len, uint32_t opacity)
+{
+    //TODO: 64bits faster?
+    auto end = src + len;
+
+    //fully opaque effect: the source pixel is used as it is
+    if (opacity == 255) {
+        for (; src != end; ++src, ++dst) {
+            *dst = *src + ALPHA_BLEND(*dst, IA(*src));
+        }
+        return;
+    }
+
+    //otherwise the source pixel is scaled by the opacity first
+    for (; src != end; ++src, ++dst) {
+        auto scaled = ALPHA_BLEND(*src, opacity);
+        *dst = scaled + ALPHA_BLEND(*dst, IA(scaled));
+    }
+}
+
+
+/* Copies len pixels from src to dst when opacity is 255, otherwise blends them with that opacity. */
+template<typename PIXEL_T>
+static void inline cRasterPixels(PIXEL_T* dst, PIXEL_T* src, uint32_t len, uint32_t opacity)
+{
+    //TODO: 64bits faster?
+    if (opacity != 255) {
+        cRasterTranslucentPixels(dst, src, len, opacity);
+        return;
+    }
+
+    //opaque: plain copy
+    for (uint32_t x = 0; x < len; ++x) {
+        dst[x] = src[x];
+    }
+}
+
+
+/* Fills len pixels with val, starting at dst + offset.
+   Pixels are written one by one until dst is 8-byte aligned, then 64 bits at a time,
+   and the remainder one by one again. A length that is zero or negative writes nothing. */
+template<typename PIXEL_T>
+static void inline cRasterPixels(PIXEL_T* dst, PIXEL_T val, uint32_t offset, int32_t len)
+{
+    //a negative length used to slip through to the leftover loop, which then never reached zero
+    if (len <= 0) return;
+
+    dst += offset;
+
+    //fix the misaligned memory
+    auto alignOffset = (long long) dst % 8;
+    if (alignOffset > 0) {
+        //convert the byte offset into the number of pixels in front of the next 8-byte boundary
+        if (sizeof(PIXEL_T) == 4) alignOffset /= 4;
+        else if (sizeof(PIXEL_T) == 1) alignOffset = 8 - alignOffset;
+        while (alignOffset > 0 && len > 0) {
+            *dst++ = val;
+            --len;
+            --alignOffset;
+        }
+    }
+
+    //64bits faster clear
+    if ((sizeof(PIXEL_T) == 4)) {
+        //two 32-bit pixels per store
+        auto val64 = (uint64_t(val) << 32) | uint64_t(val);
+        while (len > 1) {
+            *reinterpret_cast<uint64_t*>(dst) = val64;
+            len -= 2;
+            dst += 2;
+        }
+    } else if (sizeof(PIXEL_T) == 1) {
+        //eight 8-bit pixels per store
+        auto val32 = (uint32_t(val) << 24) | (uint32_t(val) << 16) | (uint32_t(val) << 8) | uint32_t(val);
+        auto val64 = (uint64_t(val32) << 32) | val32;
+        while (len > 7) {
+            *reinterpret_cast<uint64_t*>(dst) = val64;
+            len -= 8;
+            dst += 8;
+        }
+    }
+
+    //leftovers
+    while (len-- > 0) *dst++ = val;
+}
+
+
+/* Blends a translucent solid color over every span of 'rle', scaled by the span coverage.
+   Handles 32-bit and 8-bit surfaces; always returns true. */
+static bool inline cRasterTranslucentRle(SwSurface* surface, const SwRle* rle, uint8_t r, uint8_t g, uint8_t b, uint8_t a)
+{
+    auto span = rle->spans;
+
+    //32bit channels
+    if (surface->channelSize == sizeof(uint32_t)) {
+        auto color = surface->join(r, g, b, a);
+        for (uint32_t i = 0; i < rle->size; ++i, ++span) {
+            auto dst = &surface->buf32[span->y * surface->stride + span->x];
+            //partially covered spans get the color scaled by the coverage
+            uint32_t src = (span->coverage < 255) ? ALPHA_BLEND(color, span->coverage) : color;
+            auto ialpha = IA(src);
+            for (auto end = dst + span->len; dst != end; ++dst) {
+                *dst = src + ALPHA_BLEND(*dst, ialpha);
+            }
+        }
+        return true;
+    }
+
+    //8bit grayscale
+    if (surface->channelSize == sizeof(uint8_t)) {
+        for (uint32_t i = 0; i < rle->size; ++i, ++span) {
+            auto dst = &surface->buf8[span->y * surface->stride + span->x];
+            uint8_t src = (span->coverage < 255) ? MULTIPLY(span->coverage, a) : a;
+            auto ialpha = ~a;
+            for (auto end = dst + span->len; dst != end; ++dst) {
+                *dst = src + MULTIPLY(*dst, ialpha);
+            }
+        }
+    }
+    return true;
+}
+
+
+/* Blends a translucent solid color over the rectangle 'region' of the surface.
+   Handles 32-bit and 8-bit surfaces; always returns true. */
+static bool inline cRasterTranslucentRect(SwSurface* surface, const SwBBox& region, uint8_t r, uint8_t g, uint8_t b, uint8_t a)
+{
+    auto w = static_cast<uint32_t>(region.max.x - region.min.x);
+    auto h = static_cast<uint32_t>(region.max.y - region.min.y);
+
+    //32bits channels
+    if (surface->channelSize == sizeof(uint32_t)) {
+        auto color = surface->join(r, g, b, a);
+        auto buffer = surface->buf32 + (region.min.y * surface->stride) + region.min.x;
+        auto ialpha = 255 - a;
+        for (uint32_t y = 0; y < h; ++y) {
+            auto row = &buffer[y * surface->stride];
+            for (uint32_t x = 0; x < w; ++x) {
+                row[x] = color + ALPHA_BLEND(row[x], ialpha);
+            }
+        }
+        return true;
+    }
+
+    //8bit grayscale
+    if (surface->channelSize == sizeof(uint8_t)) {
+        auto buffer = surface->buf8 + (region.min.y * surface->stride) + region.min.x;
+        auto ialpha = ~a;
+        for (uint32_t y = 0; y < h; ++y) {
+            auto row = &buffer[y * surface->stride];
+            for (uint32_t x = 0; x < w; ++x) {
+                row[x] = a + MULTIPLY(row[x], ialpha);
+            }
+        }
+    }
+    return true;
+}
+
+
+/* Converts the whole surface between ABGR and ARGB in place by swapping the two channels
+   stored in bits 0-7 and bits 16-23 of every 32-bit pixel. Always returns true. */
+static bool inline cRasterABGRtoARGB(RenderSurface* surface)
+{
+    TVGLOG("SW_ENGINE", "Convert ColorSpace ABGR - ARGB [Size: %d x %d]", surface->w, surface->h);
+
+    //64bits faster converting
+    //two pixels are handled per access, so both the width and the stride have to be even:
+    //with an odd stride "stride / 2" truncates and every following row would start one pixel early
+    if (surface->w % 2 == 0 && surface->stride % 2 == 0) {
+        auto buffer = reinterpret_cast<uint64_t*>(surface->buf32);
+        for (uint32_t y = 0; y < surface->h; ++y, buffer += surface->stride / 2) {
+            auto dst = buffer;
+            for (uint32_t x = 0; x < surface->w / 2; ++x, ++dst) {
+                auto c = *dst;
+                //flip Blue, Red channels
+                *dst = (c & 0xff000000ff000000) + ((c & 0x00ff000000ff0000) >> 16) + (c & 0x0000ff000000ff00) + ((c & 0x000000ff000000ff) << 16);
+            }
+        }
+    //default converting
+    } else {
+        auto buffer = surface->buf32;
+        for (uint32_t y = 0; y < surface->h; ++y, buffer += surface->stride) {
+            auto dst = buffer;
+            for (uint32_t x = 0; x < surface->w; ++x, ++dst) {
+                auto c = *dst;
+                //flip Blue, Red channels
+                *dst = (c & 0xff000000) + ((c & 0x00ff0000) >> 16) + (c & 0x0000ff00) + ((c & 0x000000ff) << 16);
+            }
+        }
+    }
+    return true;
+}
+
+
+static bool inline cRasterARGBtoABGR(RenderSurface* surface)
+{
+    //the conversion is the same channel swap in both directions, so ABGRtoARGB is reused
+    auto converted = cRasterABGRtoARGB(surface);
+    return converted;
+}
--- a/thirdparty/thorvg/src/renderer/sw_engine/tvgSwRasterNeon.h
+++ b/thirdparty/thorvg/src/renderer/sw_engine/tvgSwRasterNeon.h
@@ -0,0 +1,200 @@
+/*
+ * Copyright (c) 2021 - 2024 the ThorVG project. All rights reserved.
+
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifdef THORVG_NEON_VECTOR_SUPPORT
+
+#include <arm_neon.h>
+
+//TODO : need to support windows ARM
+ 
+#if defined(__ARM_64BIT_STATE) || defined(_M_ARM64)
+#define TVG_AARCH64 1  //64-bit ARM: selects the 4-register store paths in the fill functions below
+#else
+#define TVG_AARCH64 0  //otherwise: the fill functions below store one 128-bit register at a time
+#endif
+
+
+/* Vector counterpart of ALPHA_BLEND: per byte lane (c * a) >> 8. */
+static inline uint8x8_t ALPHA_BLEND(uint8x8_t c, uint8x8_t a)
+{
+    //widen to 16 bits for the product, then narrow back while shifting right by 8
+    return vshrn_n_u16(vmull_u8(c, a), 8);
+}
+
+
+/* Fills len bytes with val, starting at dst + offset. */
+static void neonRasterGrayscale8(uint8_t* dst, uint8_t val, uint32_t offset, int32_t len)
+{
+    auto out = dst + offset;
+    const uint8x16_t fill = vdupq_n_u8(val);
+    int32_t done = 0;
+
+#if TVG_AARCH64
+    //four 16-byte registers (64 bytes) per store
+    uint8x16x4_t fillQuad = {fill, fill, fill, fill};
+    while (done <= len - 16 * 4) {
+        vst1q_u8_x4(out + done, fillQuad);
+        done += 16 * 4;
+    }
+#else
+    //one 16-byte register per store
+    while (done <= len - 16) {
+        vst1q_u8(out + done, fill);
+        done += 16;
+    }
+#endif
+    //remaining bytes, one at a time
+    while (done < len) {
+        out[done] = val;
+        ++done;
+    }
+}
+
+
+/* Fills len 32-bit pixels with val, starting at dst + offset.
+   A length that is zero or negative writes nothing. */
+static void neonRasterPixel32(uint32_t *dst, uint32_t val, uint32_t offset, int32_t len)
+{
+    //a negative length used to wrap the counters below and run far past the buffer
+    if (len <= 0) return;
+
+    dst += offset;
+
+    uint32x4_t vectorVal = vdupq_n_u32(val);
+
+#if TVG_AARCH64
+    //16 pixels per store; all four registers hold the same value, so the interleaving of vst4q is harmless
+    uint32_t iterations = len / 16;
+    uint32_t neonFilled = iterations * 16;
+    uint32x4x4_t valQuad = {vectorVal, vectorVal, vectorVal, vectorVal};
+    for (uint32_t i = 0; i < iterations; ++i) {
+        vst4q_u32(dst, valQuad);
+        dst += 16;
+    }
+#else
+    //4 pixels per store
+    uint32_t iterations = len / 4;
+    uint32_t neonFilled = iterations * 4;
+    for (uint32_t i = 0; i < iterations; ++i) {
+        vst1q_u32(dst, vectorVal);
+        dst += 4;
+    }
+#endif
+    //pixels that did not fill a whole store
+    int32_t leftovers = len - neonFilled;
+    while (leftovers-- > 0) *dst++ = val;
+}
+
+
+/* Blends a translucent solid color over every span of 'rle', scaled by the span coverage.
+   32-bit surfaces process two pixels per 64-bit register; 8-bit surfaces use a plain loop.
+   Always returns true. */
+static bool neonRasterTranslucentRle(SwSurface* surface, const SwRle* rle, uint8_t r, uint8_t g, uint8_t b, uint8_t a)
+{
+    auto span = rle->spans;
+
+    //8bit grayscale
+    if (surface->channelSize == sizeof(uint8_t)) {
+        TVGLOG("SW_ENGINE", "Require Neon Optimization, Channel Size = %d", surface->channelSize);
+        for (uint32_t i = 0; i < rle->size; ++i, ++span) {
+            auto dst = &surface->buf8[span->y * surface->stride + span->x];
+            uint8_t src = (span->coverage < 255) ? MULTIPLY(span->coverage, a) : a;
+            auto ialpha = ~a;
+            for (uint32_t x = 0; x < span->len; ++x, ++dst) {
+                *dst = src + MULTIPLY(*dst, ialpha);
+            }
+        }
+        return true;
+    }
+
+    //any other channel size than 32 bits is left untouched
+    if (surface->channelSize != sizeof(uint32_t)) return true;
+
+    //32bit channels
+    auto color = surface->join(r, g, b, a);
+
+    for (uint32_t i = 0; i < rle->size; ++i, ++span) {
+        //partially covered spans get the color scaled by the coverage
+        uint32_t src = (span->coverage < 255) ? ALPHA_BLEND(color, span->coverage) : color;
+
+        auto dst = &surface->buf32[span->y * surface->stride + span->x];
+        auto ialpha = IA(src);
+
+        //a first pixel that is not 8-byte aligned is blended alone, the vector part starts behind it
+        uint16_t align = ((((uintptr_t) dst) & 0x7) != 0) ? 1 : 0;
+        if (align) *dst = src + ALPHA_BLEND(*dst, ialpha);
+        auto vDst = (uint8x8_t*)(dst + align);
+
+        uint8x8_t vSrc = (uint8x8_t) vdup_n_u32(src);
+        uint8x8_t vIalpha = vdup_n_u8((uint8_t) ialpha);
+
+        //two pixels per register
+        for (uint32_t x = 0; x < (span->len - align) / 2; ++x) {
+            vDst[x] = vadd_u8(vSrc, ALPHA_BLEND(vDst[x], vIalpha));
+        }
+
+        //an odd pixel count leaves the last pixel of the span for the scalar path
+        if ((span->len - align) % 2 > 0) {
+            dst[span->len - 1] = src + ALPHA_BLEND(dst[span->len - 1], ialpha);
+        }
+    }
+    return true;
+}
+
+
+/* Blends a translucent solid color over the rectangle 'region' of the surface.
+   32-bit surfaces process two pixels per 64-bit register; 8-bit surfaces use a plain loop.
+   Always returns true. */
+static bool neonRasterTranslucentRect(SwSurface* surface, const SwBBox& region, uint8_t r, uint8_t g, uint8_t b, uint8_t a)
+{
+    auto w = static_cast<uint32_t>(region.max.x - region.min.x);
+    auto h = static_cast<uint32_t>(region.max.y - region.min.y);
+
+    //8bit grayscale
+    if (surface->channelSize == sizeof(uint8_t)) {
+        TVGLOG("SW_ENGINE", "Require Neon Optimization, Channel Size = %d", surface->channelSize);
+        auto buffer = surface->buf8 + (region.min.y * surface->stride) + region.min.x;
+        auto ialpha = ~a;
+        for (uint32_t y = 0; y < h; ++y) {
+            auto dst = &buffer[y * surface->stride];
+            for (uint32_t x = 0; x < w; ++x) {
+                dst[x] = a + MULTIPLY(dst[x], ialpha);
+            }
+        }
+        return true;
+    }
+
+    //any other channel size than 32 bits is left untouched
+    if (surface->channelSize != sizeof(uint32_t)) return true;
+
+    //32bits channels
+    auto color = surface->join(r, g, b, a);
+    auto buffer = surface->buf32 + (region.min.y * surface->stride) + region.min.x;
+    auto ialpha = 255 - a;
+
+    auto vColor = vdup_n_u32(color);
+    auto vIalpha = vdup_n_u8((uint8_t) ialpha);
+
+    for (uint32_t y = 0; y < h; ++y) {
+        auto dst = &buffer[y * surface->stride];
+
+        //a first pixel that is not 8-byte aligned is blended alone, the vector part starts behind it
+        uint32_t align = ((((uintptr_t) dst) & 0x7) != 0) ? 1 : 0;
+        if (align) *dst = color + ALPHA_BLEND(*dst, ialpha);
+        auto vDst = (uint8x8_t*) (dst + align);
+
+        //two pixels per register
+        for (uint32_t x = 0; x < (w - align) / 2; ++x) {
+            vDst[x] = vadd_u8((uint8x8_t)vColor, ALPHA_BLEND(vDst[x], vIalpha));
+        }
+
+        //an odd pixel count leaves the last pixel of the row for the scalar path
+        if ((w - align) % 2 > 0) {
+            dst[w - 1] = color + ALPHA_BLEND(dst[w - 1], ialpha);
+        }
+    }
+    return true;
+}
+
+#endif
--- a/thirdparty/thorvg/src/renderer/sw_engine/tvgSwRasterTexmap.h
+++ b/thirdparty/thorvg/src/renderer/sw_engine/tvgSwRasterTexmap.h
@@ -0,0 +1,962 @@
+/*
+ * Copyright (c) 2021 - 2024 the ThorVG project. All rights reserved.
+
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+struct Vertex        //one corner of a textured triangle
+{
+   Point pt;         //position; _rasterTexmapPolygon() multiplies it by the image transform
+   Point uv;         //texture coordinate, given in source image pixels (0..w, 0..h)
+};
+
+struct Polygon       //a single triangle handed to _rasterPolygonImage()
+{
+   Vertex vertex[3]; //the three corners, in any order
+};
+
+struct AALine             //anti-aliasing data of one scanline; index 0 is the left edge, 1 the right edge
+{
+   int32_t x[2];          //x[0]: smallest start x, x[1]: largest end x rasterized on this row
+   int32_t coverage[2];   //coverage step per pixel, multiplied by the pixel position in _apply()
+   int32_t length[2];     //number of pixels smoothed from each edge; 0 means no smoothing
+};
+
+struct AASpans            //anti-aliasing table covering the rows [yStart, yEnd)
+{
+   AALine *lines;         //(yEnd - yStart) entries allocated by _AASpans(); lines[0] belongs to row yStart
+   int32_t yStart;        //first row covered
+   int32_t yEnd;          //end row, exclusive
+};
+
+//Careful! Shared state: written by _rasterPolygonImage(), read and stepped by the segment rasterizers. Not thread-safe.
+static float dudx, dvdx;                       //u/v increments for a one-pixel step in x
+static float dxdya, dxdyb, dudya, dvdya;       //per-row increments: x of the left(a)/right(b) edge and u/v along the left edge
+static float xa, xb, ua, va;                   //running values for the current row: left/right edge x and u/v at the left edge
+
+
+/*
+ * Clamps the row range [yStart, yEnd) to the vertical drawing area.
+ * The area comes from the region box when one is given, otherwise from the y of the first and
+ * the last span of image->rle.
+ * Returns false, leaving both values untouched, when the range starts at or below the bottom.
+ */
+static bool _arrange(const SwImage* image, const SwBBox* region, int& yStart, int& yEnd)
+{
+    const int32_t top = region ? region->min.y : image->rle->spans[0].y;
+    const int32_t bottom = region ? region->max.y : image->rle->spans[image->rle->size - 1].y;
+
+    //Entirely below the drawing area
+    if (yStart >= bottom) return false;
+
+    if (top > yStart) yStart = top;
+    if (bottom < yEnd) yEnd = bottom;
+
+    return true;
+}
+
+/*
+ * Placeholder for the masked variant of the polygon segment rasterizer.
+ * Not implemented yet: none of the arguments are used, a TODO error is logged and false is returned.
+ */
+static bool _rasterMaskedPolygonImageSegment(SwSurface* surface, const SwImage* image, const SwBBox* region, int yStart, int yEnd, AASpans* aaSpans, uint8_t opacity, uint8_t dirFlag = 0)
+{
+    TVGERR("SW_ENGINE", "TODO: _rasterMaskedPolygonImageSegment()");
+    return false;
+}
+
+
+/*
+ * Rasterizes the rows [yStart, yEnd) of one triangle segment with bilinear texture sampling and
+ * writes every pixel through surface->blender().
+ *
+ * The edge and UV state is taken from the file-level statics (dudx, dvdx, dxdya, dxdyb, dudya,
+ * dvdya, xa, xb, ua, va) prepared by _rasterPolygonImage(). The stepped xa, xb, ua and va are
+ * stored back on exit so that the following segment continues from them.
+ *
+ * region:  clipping box, or nullptr to clip each row against the spans of image->rle.
+ * aaSpans: the x extents of each drawn row are widened here for the later anti-aliasing pass.
+ * opacity: 255 blends the sampled pixel as is, any other value scales it with ALPHA_BLEND() first.
+ */
+static void _rasterBlendingPolygonImageSegment(SwSurface* surface, const SwImage* image, const SwBBox* region, int yStart, int yEnd, AASpans* aaSpans, uint8_t opacity)
+{
+    float _dudx = dudx, _dvdx = dvdx;
+    float _dxdya = dxdya, _dxdyb = dxdyb, _dudya = dudya, _dvdya = dvdya;
+    float _xa = xa, _xb = xb, _ua = ua, _va = va;
+    auto sbuf = image->buf32;
+    auto dbuf = surface->buf32;
+    int32_t sw = static_cast<int32_t>(image->w);
+    int32_t sh = static_cast<int32_t>(image->h);
+    int32_t x1, x2, x, y, ar, ab, iru, irv, px, ay;
+    int32_t vv = 0, uu = 0;
+    int32_t minx = INT32_MAX, maxx = 0;
+    float dx, u, v, iptr;
+    uint32_t* buf;
+    SwSpan* span = nullptr;         //used only when rle based.
+
+    if (!_arrange(image, region, yStart, yEnd)) return;
+
+    //Loop through all lines in the segment
+    uint32_t spanIdx = 0;
+
+    if (region) {
+        minx = region->min.x;
+        maxx = region->max.x;
+    } else {
+        //skip the spans above the first row; _arrange() guarantees yStart is above the last span
+        span = image->rle->spans;
+        while (span->y < yStart) {
+            ++span;
+            ++spanIdx;
+        }
+    }
+
+    y = yStart;
+
+    while (y < yEnd) {
+        x1 = (int32_t)_xa;
+        x2 = (int32_t)_xb;
+
+        if (!region) {
+            minx = INT32_MAX;
+            maxx = 0;
+            //one single row, could be consisted of multiple spans.
+            //the index is tested first so that span is never read past the end of the span array.
+            while (spanIdx < image->rle->size && span->y == y) {
+                if (minx > span->x) minx = span->x;
+                if (maxx < span->x + span->len) maxx = span->x + span->len;
+                ++span;
+                ++spanIdx;
+            }
+        }
+        if (x1 < minx) x1 = minx;
+        if (x2 > maxx) x2 = maxx;
+
+        //Anti-Aliasing frames
+        ay = y - aaSpans->yStart;
+        if (aaSpans->lines[ay].x[0] > x1) aaSpans->lines[ay].x[0] = x1;
+        if (aaSpans->lines[ay].x[1] < x2) aaSpans->lines[ay].x[1] = x2;
+
+        //Range allowed
+        if ((x2 - x1) >= 1 && (x1 < maxx) && (x2 > minx)) {
+
+            //Perform subtexel pre-stepping on UV
+            dx = 1 - (_xa - x1);
+            u = _ua + dx * _dudx;
+            v = _va + dx * _dvdx;
+
+            buf = dbuf + ((y * surface->stride) + x1);
+
+            x = x1;
+
+            if (opacity == 255) {
+                //Draw horizontal line
+                while (x++ < x2) {
+                    uu = (int) u;
+                    vv = (int) v;
+
+                    //NOTE(review): buf, u and v are not advanced on this path, so every following
+                    //pixel of the row is skipped as well - confirm that this is intended.
+                    if ((uint32_t) uu >= image->w || (uint32_t) vv >= image->h) continue;
+
+                    //weights of the top-left texel for the bilinear filter
+                    ar = (int)(255 * (1 - modff(u, &iptr)));
+                    ab = (int)(255 * (1 - modff(v, &iptr)));
+                    iru = uu + 1;
+                    irv = vv + 1;
+
+                    px = *(sbuf + (vv * image->stride) + uu);
+
+                    /* horizontal interpolate */
+                    if (iru < sw) {
+                        /* right pixel */
+                        int px2 = *(sbuf + (vv * image->stride) + iru);
+                        px = INTERPOLATE(px, px2, ar);
+                    }
+                    /* vertical interpolate */
+                    if (irv < sh) {
+                        /* bottom pixel */
+                        int px2 = *(sbuf + (irv * image->stride) + uu);
+
+                        /* horizontal interpolate */
+                        if (iru < sw) {
+                            /* bottom right pixel */
+                            int px3 = *(sbuf + (irv * image->stride) + iru);
+                            px2 = INTERPOLATE(px2, px3, ar);
+                        }
+                        px = INTERPOLATE(px, px2, ab);
+                    }
+                    *buf = surface->blender(px, *buf, IA(px));
+                    ++buf;
+
+                    //Step UV horizontally
+                    u += _dudx;
+                    v += _dvdx;
+                }
+            } else {
+                //Draw horizontal line
+                while (x++ < x2) {
+                    uu = (int) u;
+                    vv = (int) v;
+
+                    //NOTE(review): same as above, the rest of the row is skipped from here on.
+                    if ((uint32_t) uu >= image->w || (uint32_t) vv >= image->h) continue;
+
+                    ar = (int)(255 * (1 - modff(u, &iptr)));
+                    ab = (int)(255 * (1 - modff(v, &iptr)));
+                    iru = uu + 1;
+                    irv = vv + 1;
+
+                    px = *(sbuf + (vv * image->stride) + uu);
+
+                    /* horizontal interpolate */
+                    if (iru < sw) {
+                        /* right pixel */
+                        int px2 = *(sbuf + (vv * image->stride) + iru);
+                        px = INTERPOLATE(px, px2, ar);
+                    }
+                    /* vertical interpolate */
+                    if (irv < sh) {
+                        /* bottom pixel */
+                        int px2 = *(sbuf + (irv * image->stride) + uu);
+
+                        /* horizontal interpolate */
+                        if (iru < sw) {
+                            /* bottom right pixel */
+                            int px3 = *(sbuf + (irv * image->stride) + iru);
+                            px2 = INTERPOLATE(px2, px3, ar);
+                        }
+                        px = INTERPOLATE(px, px2, ab);
+                    }
+                    auto src = ALPHA_BLEND(px, opacity);
+                    *buf = surface->blender(src, *buf, IA(src));
+                    ++buf;
+
+                    //Step UV horizontally
+                    u += _dudx;
+                    v += _dvdx;
+                }
+            }
+        }
+
+        //Step along both edges
+        _xa += _dxdya;
+        _xb += _dxdyb;
+        _ua += _dudya;
+        _va += _dvdya;
+
+        //all spans are consumed, no more rows to draw
+        if (!region && spanIdx >= image->rle->size) break;
+
+        ++y;
+    }
+    //hand the stepped edge state over to the next segment
+    xa = _xa;
+    xb = _xb;
+    ua = _ua;
+    va = _va;
+}
+
+
+/*
+ * Rasterizes the rows [yStart, yEnd) of one triangle segment with bilinear texture sampling and
+ * composes every pixel as dst = src + ALPHA_BLEND(dst, IA(src)).
+ * With matting enabled the source is additionally scaled by the alpha read from the compositor image.
+ *
+ * The edge and UV state is taken from the file-level statics (dudx, dvdx, dxdya, dxdyb, dudya,
+ * dvdya, xa, xb, ua, va) prepared by _rasterPolygonImage(). The stepped xa, xb, ua and va are
+ * stored back on exit so that the following segment continues from them.
+ *
+ * region:  clipping box, or nullptr to clip each row against the spans of image->rle.
+ * aaSpans: the x extents of each drawn row are widened here for the later anti-aliasing pass.
+ * opacity: 255 draws the sampled pixel as is, any other value scales it first.
+ * matting: when true, surface->compositor must be valid.
+ */
+static void _rasterPolygonImageSegment(SwSurface* surface, const SwImage* image, const SwBBox* region, int yStart, int yEnd, AASpans* aaSpans, uint8_t opacity, bool matting)
+{
+    float _dudx = dudx, _dvdx = dvdx;
+    float _dxdya = dxdya, _dxdyb = dxdyb, _dudya = dudya, _dvdya = dvdya;
+    float _xa = xa, _xb = xb, _ua = ua, _va = va;
+    auto sbuf = image->buf32;
+    auto dbuf = surface->buf32;
+    int32_t sw = static_cast<int32_t>(image->w);
+    int32_t sh = static_cast<int32_t>(image->h);
+    int32_t x1, x2, x, y, ar, ab, iru, irv, px, ay;
+    int32_t vv = 0, uu = 0;
+    int32_t minx = INT32_MAX, maxx = 0;
+    float dx, u, v, iptr;
+    uint32_t* buf;
+    SwSpan* span = nullptr;         //used only when rle based.
+
+    //for matting(composition)
+    auto csize = matting ? surface->compositor->image.channelSize: 0;
+    auto alpha = matting ? surface->alpha(surface->compositor->method) : nullptr;
+    uint8_t* cmp = nullptr;
+
+    if (!_arrange(image, region, yStart, yEnd)) return;
+
+    //Loop through all lines in the segment
+    uint32_t spanIdx = 0;
+
+    if (region) {
+        minx = region->min.x;
+        maxx = region->max.x;
+    } else {
+        //skip the spans above the first row; _arrange() guarantees yStart is above the last span
+        span = image->rle->spans;
+        while (span->y < yStart) {
+            ++span;
+            ++spanIdx;
+        }
+    }
+
+    y = yStart;
+
+    while (y < yEnd) {
+        x1 = (int32_t)_xa;
+        x2 = (int32_t)_xb;
+
+        if (!region) {
+            minx = INT32_MAX;
+            maxx = 0;
+            //one single row, could be consisted of multiple spans.
+            //the index is tested first so that span is never read past the end of the span array.
+            while (spanIdx < image->rle->size && span->y == y) {
+                if (minx > span->x) minx = span->x;
+                if (maxx < span->x + span->len) maxx = span->x + span->len;
+                ++span;
+                ++spanIdx;
+            }
+        }
+        if (x1 < minx) x1 = minx;
+        if (x2 > maxx) x2 = maxx;
+
+        //Anti-Aliasing frames
+        ay = y - aaSpans->yStart;
+        if (aaSpans->lines[ay].x[0] > x1) aaSpans->lines[ay].x[0] = x1;
+        if (aaSpans->lines[ay].x[1] < x2) aaSpans->lines[ay].x[1] = x2;
+
+        //Range allowed
+        if ((x2 - x1) >= 1 && (x1 < maxx) && (x2 > minx)) {
+
+            //Perform subtexel pre-stepping on UV
+            dx = 1 - (_xa - x1);
+            u = _ua + dx * _dudx;
+            v = _va + dx * _dvdx;
+
+            buf = dbuf + ((y * surface->stride) + x1);
+
+            x = x1;
+
+            if (matting) cmp = &surface->compositor->image.buf8[(y * surface->compositor->image.stride + x1) * csize];
+
+            if (opacity == 255) {
+                //Draw horizontal line
+                while (x++ < x2) {
+                    uu = (int) u;
+                    vv = (int) v;
+
+                    //NOTE(review): buf, cmp, u and v are not advanced on this path, so every
+                    //following pixel of the row is skipped as well - confirm that this is intended.
+                    if ((uint32_t) uu >= image->w || (uint32_t) vv >= image->h) continue;
+
+                    //weights of the top-left texel for the bilinear filter
+                    ar = (int)(255.0f * (1.0f - modff(u, &iptr)));
+                    ab = (int)(255.0f * (1.0f - modff(v, &iptr)));
+                    iru = uu + 1;
+                    irv = vv + 1;
+
+                    px = *(sbuf + (vv * image->stride) + uu);
+
+                    /* horizontal interpolate */
+                    if (iru < sw) {
+                        /* right pixel */
+                        int px2 = *(sbuf + (vv * image->stride) + iru);
+                        px = INTERPOLATE(px, px2, ar);
+                    }
+                    /* vertical interpolate */
+                    if (irv < sh) {
+                        /* bottom pixel */
+                        int px2 = *(sbuf + (irv * image->stride) + uu);
+
+                        /* horizontal interpolate */
+                        if (iru < sw) {
+                            /* bottom right pixel */
+                            int px3 = *(sbuf + (irv * image->stride) + iru);
+                            px2 = INTERPOLATE(px2, px3, ar);
+                        }
+                        px = INTERPOLATE(px, px2, ab);
+                    }
+                    uint32_t src;
+                    if (matting) {
+                        src = ALPHA_BLEND(px, alpha(cmp));
+                        cmp += csize;
+                    } else {
+                        src = px;
+                    }
+                    *buf = src + ALPHA_BLEND(*buf, IA(src));
+                    ++buf;
+
+                    //Step UV horizontally
+                    u += _dudx;
+                    v += _dvdx;
+                }
+            } else {
+                //Draw horizontal line
+                while (x++ < x2) {
+                    uu = (int) u;
+                    vv = (int) v;
+
+                    //NOTE(review): same as above, the rest of the row is skipped from here on.
+                    if ((uint32_t) uu >= image->w || (uint32_t) vv >= image->h) continue;
+
+                    ar = (int)(255.0f * (1.0f - modff(u, &iptr)));
+                    ab = (int)(255.0f * (1.0f - modff(v, &iptr)));
+                    iru = uu + 1;
+                    irv = vv + 1;
+
+                    //rows are image->stride pixels apart (not image->w), as in every other fetch here
+                    px = *(sbuf + (vv * image->stride) + uu);
+
+                    /* horizontal interpolate */
+                    if (iru < sw) {
+                        /* right pixel */
+                        int px2 = *(sbuf + (vv * image->stride) + iru);
+                        px = INTERPOLATE(px, px2, ar);
+                    }
+                    /* vertical interpolate */
+                    if (irv < sh) {
+                        /* bottom pixel */
+                        int px2 = *(sbuf + (irv * image->stride) + uu);
+
+                        /* horizontal interpolate */
+                        if (iru < sw) {
+                            /* bottom right pixel */
+                            int px3 = *(sbuf + (irv * image->stride) + iru);
+                            px2 = INTERPOLATE(px2, px3, ar);
+                        }
+                        px = INTERPOLATE(px, px2, ab);
+                    }
+                    uint32_t src;
+                    if (matting) {
+                        src = ALPHA_BLEND(px, MULTIPLY(opacity, alpha(cmp)));
+                        cmp += csize;
+                    } else {
+                        src = ALPHA_BLEND(px, opacity);
+                    }
+                    *buf = src + ALPHA_BLEND(*buf, IA(src));
+                    ++buf;
+
+                    //Step UV horizontally
+                    u += _dudx;
+                    v += _dvdx;
+                }
+            }
+        }
+
+        //Step along both edges
+        _xa += _dxdya;
+        _xb += _dxdyb;
+        _ua += _dudya;
+        _va += _dvdya;
+
+        //all spans are consumed, no more rows to draw
+        if (!region && spanIdx >= image->rle->size) break;
+
+        ++y;
+    }
+    //hand the stepped edge state over to the next segment
+    xa = _xa;
+    xb = _xb;
+    ua = _ua;
+    va = _va;
+}
+
+
+/* Rasterizes one textured triangle of the image onto the surface.
+ *
+ * The vertices are sorted by Y, then the UV gradients and the edge slopes are stored into the
+ * file-level statics (dudx, dvdx, dxdya, dxdyb, dudya, dvdya, xa, xb, ua, va). After that the
+ * upper segment (rows yi[0]..yi[1]) and the lower segment (rows yi[1]..yi[2]) are drawn by one
+ * of the segment rasterizers, selected by the compositing/blending state of the surface.
+ *
+ * region:  clipping box, or nullptr when the image is clipped by image->rle.
+ * aaSpans: row extents table that the segment rasterizers update.
+ *
+ * This mapping algorithm is based on Mikael Kalms's. */
+static void _rasterPolygonImage(SwSurface* surface, const SwImage* image, const SwBBox* region, Polygon& polygon, AASpans* aaSpans, uint8_t opacity)
+{
+    float x[3] = {polygon.vertex[0].pt.x, polygon.vertex[1].pt.x, polygon.vertex[2].pt.x};
+    float y[3] = {polygon.vertex[0].pt.y, polygon.vertex[1].pt.y, polygon.vertex[2].pt.y};
+    float u[3] = {polygon.vertex[0].uv.x, polygon.vertex[1].uv.x, polygon.vertex[2].uv.x};
+    float v[3] = {polygon.vertex[0].uv.y, polygon.vertex[1].uv.y, polygon.vertex[2].uv.y};
+
+    float off_y;     //rows of the segment lying above the top of the drawing area
+    float dxdy[3] = {0.0f, 0.0f, 0.0f};     //x slopes of the edges 0-1, 0-2 and 1-2
+
+    auto upper = false;     //set once the upper segment has been drawn
+
+    //Sort the local copies of the vertices in ascending Y order (the polygon itself is untouched)
+    if (y[0] > y[1]) {
+        std::swap(x[0], x[1]);
+        std::swap(y[0], y[1]);
+        std::swap(u[0], u[1]);
+        std::swap(v[0], v[1]);
+    }
+    if (y[0] > y[2])  {
+        std::swap(x[0], x[2]);
+        std::swap(y[0], y[2]);
+        std::swap(u[0], u[2]);
+        std::swap(v[0], v[2]);
+    }
+    if (y[1] > y[2]) {
+        std::swap(x[1], x[2]);
+        std::swap(y[1], y[2]);
+        std::swap(u[1], u[2]);
+        std::swap(v[1], v[2]);
+    }
+
+    //Y indexes (rows)
+    int yi[3] = {(int)y[0], (int)y[1], (int)y[2]};
+
+    //Skip drawing if it's too thin to cover any pixels at all.
+    if ((yi[0] == yi[1] && yi[0] == yi[2]) || ((int) x[0] == (int) x[1] && (int) x[0] == (int) x[2])) return;
+
+    //Calculate horizontal and vertical increments for UV axes (these calcs are certainly not optimal, although they're stable: they handle any dy being 0)
+    auto denom = ((x[2] - x[0]) * (y[1] - y[0]) - (x[1] - x[0]) * (y[2] - y[0]));
+
+    //Skip poly if it's an infinitely thin line
+    if (tvg::zero(denom)) return;
+
+    denom = 1 / denom;   //Reciprocal for speeding up
+    dudx = ((u[2] - u[0]) * (y[1] - y[0]) - (u[1] - u[0]) * (y[2] - y[0])) * denom;
+    dvdx = ((v[2] - v[0]) * (y[1] - y[0]) - (v[1] - v[0]) * (y[2] - y[0])) * denom;
+    auto dudy = ((u[1] - u[0]) * (x[2] - x[0]) - (u[2] - u[0]) * (x[1] - x[0])) * denom;
+    auto dvdy = ((v[1] - v[0]) * (x[2] - x[0]) - (v[2] - v[0]) * (x[1] - x[0])) * denom;
+
+    //Calculate X-slopes along the edges; a horizontal edge keeps the slope 0
+    if (y[1] > y[0]) dxdy[0] = (x[1] - x[0]) / (y[1] - y[0]);
+    if (y[2] > y[0]) dxdy[1] = (x[2] - x[0]) / (y[2] - y[0]);
+    if (y[2] > y[1]) dxdy[2] = (x[2] - x[1]) / (y[2] - y[1]);
+
+    //Determine which side of the polygon the longer edge is on (true: right side)
+    auto side = (dxdy[1] > dxdy[0]) ? true : false;
+
+    if (tvg::equal(y[0], y[1])) side = x[0] > x[1];     //flat top: decided by the x order of the two top vertices
+    if (tvg::equal(y[1], y[2])) side = x[2] > x[1];     //flat bottom: decided by the x order of the two bottom vertices
+
+    auto regionTop = region ? region->min.y : image->rle->spans->y;  //Normal Image or Rle Image?
+    auto compositing = _compositing(surface);   //Composition required
+    auto blending = _blending(surface);         //Blending required
+
+    //Longer edge is on the left side
+    if (!side) {
+        //Calculate slopes along left edge
+        dxdya = dxdy[1];
+        dudya = dxdya * dudx + dudy;
+        dvdya = dxdya * dvdx + dvdy;
+
+        //Perform subpixel pre-stepping along left edge
+        auto dy = 1.0f - (y[0] - yi[0]);
+        xa = x[0] + dy * dxdya;
+        ua = u[0] + dy * dudya;
+        va = v[0] + dy * dvdya;
+
+        //Draw upper segment if possibly visible
+        if (yi[0] < yi[1]) {
+            off_y = y[0] < regionTop ? (regionTop - y[0]) : 0;
+            xa += (off_y * dxdya);
+            ua += (off_y * dudya);
+            va += (off_y * dvdya);
+
+            // Set right edge X-slope and perform subpixel pre-stepping
+            dxdyb = dxdy[0];
+            xb = x[0] + dy * dxdyb + (off_y * dxdyb);
+
+            if (compositing) {
+                if (_matting(surface)) _rasterPolygonImageSegment(surface, image, region, yi[0], yi[1], aaSpans, opacity, true);
+                else _rasterMaskedPolygonImageSegment(surface, image, region, yi[0], yi[1], aaSpans, opacity, 1);
+            } else if (blending) {
+                _rasterBlendingPolygonImageSegment(surface, image, region, yi[0], yi[1], aaSpans, opacity);
+            } else {
+                _rasterPolygonImageSegment(surface, image, region, yi[0], yi[1], aaSpans, opacity, false);
+            }
+            upper = true;
+        }
+        //Draw lower segment if possibly visible
+        if (yi[1] < yi[2]) {
+            off_y = y[1] < regionTop ? (regionTop - y[1]) : 0;
+            if (!upper) {     //otherwise the left edge state was already stepped by the upper segment
+                xa += (off_y * dxdya);
+                ua += (off_y * dudya);
+                va += (off_y * dvdya);
+            }
+            // Set right edge X-slope and perform subpixel pre-stepping
+            dxdyb = dxdy[2];
+            xb = x[1] + (1 - (y[1] - yi[1])) * dxdyb + (off_y * dxdyb);
+            if (compositing) {
+                if (_matting(surface)) _rasterPolygonImageSegment(surface, image, region, yi[1], yi[2], aaSpans, opacity, true);
+                else _rasterMaskedPolygonImageSegment(surface, image, region, yi[1], yi[2], aaSpans, opacity, 2);
+            } else if (blending) {
+                 _rasterBlendingPolygonImageSegment(surface, image, region, yi[1], yi[2], aaSpans, opacity);
+            } else {
+                _rasterPolygonImageSegment(surface, image, region, yi[1], yi[2], aaSpans, opacity, false);
+            }
+        }
+    //Longer edge is on the right side
+    } else {
+        //Set right edge X-slope and perform subpixel pre-stepping
+        dxdyb = dxdy[1];
+        auto dy = 1.0f - (y[0] - yi[0]);
+        xb = x[0] + dy * dxdyb;
+
+        //Draw upper segment if possibly visible
+        if (yi[0] < yi[1]) {
+            off_y = y[0] < regionTop ? (regionTop - y[0]) : 0;
+            xb += (off_y *dxdyb);
+
+            // Set slopes along left edge and perform subpixel pre-stepping
+            dxdya = dxdy[0];
+            dudya = dxdya * dudx + dudy;
+            dvdya = dxdya * dvdx + dvdy;
+
+            xa = x[0] + dy * dxdya + (off_y * dxdya);
+            ua = u[0] + dy * dudya + (off_y * dudya);
+            va = v[0] + dy * dvdya + (off_y * dvdya);
+
+            if (compositing) {
+                if (_matting(surface)) _rasterPolygonImageSegment(surface, image, region, yi[0], yi[1], aaSpans, opacity, true);
+                else _rasterMaskedPolygonImageSegment(surface, image, region, yi[0], yi[1], aaSpans, opacity, 3);
+            } else if (blending) {
+                _rasterBlendingPolygonImageSegment(surface, image, region, yi[0], yi[1], aaSpans, opacity);
+            } else {
+                _rasterPolygonImageSegment(surface, image, region, yi[0], yi[1], aaSpans, opacity, false);
+            }
+            upper = true;
+        }
+        //Draw lower segment if possibly visible
+        if (yi[1] < yi[2]) {
+            off_y = y[1] < regionTop ? (regionTop - y[1]) : 0;
+            if (!upper) xb += (off_y *dxdyb);     //otherwise the right edge was already stepped by the upper segment
+
+            // Set slopes along left edge and perform subpixel pre-stepping
+            dxdya = dxdy[2];
+            dudya = dxdya * dudx + dudy;
+            dvdya = dxdya * dvdx + dvdy;
+            dy = 1 - (y[1] - yi[1]);
+            xa = x[1] + dy * dxdya + (off_y * dxdya);
+            ua = u[1] + dy * dudya + (off_y * dudya);
+            va = v[1] + dy * dvdya + (off_y * dvdya);
+
+            if (compositing) {
+                if (_matting(surface)) _rasterPolygonImageSegment(surface, image, region, yi[1], yi[2], aaSpans, opacity, true);
+                else _rasterMaskedPolygonImageSegment(surface, image, region, yi[1], yi[2], aaSpans, opacity, 4);
+            } else if (blending) {
+                _rasterBlendingPolygonImageSegment(surface, image, region, yi[1], yi[2], aaSpans, opacity);
+            } else {
+                _rasterPolygonImageSegment(surface, image, region, yi[1], yi[2], aaSpans, opacity, false);
+            }
+        }
+    }
+}
+
+
+/*
+ * Allocates the per-row anti-aliasing table for the rows between ymin and ymax, clamped to the
+ * drawing area by _arrange().
+ *
+ * Every row starts with an empty x range (x[0] = INT32_MAX, x[1] = 0) and zero edge lengths.
+ * coverage[] is left unset; it is written together with length[] by the _calc*Coverage() helpers.
+ *
+ * Returns nullptr when no row is left after clamping or when an allocation fails. Otherwise the
+ * table is malloc()ed and released by _apply().
+ */
+static AASpans* _AASpans(float ymin, float ymax, const SwImage* image, const SwBBox* region)
+{
+    auto yStart = static_cast<int>(ymin);
+    auto yEnd = static_cast<int>(ymax);
+
+    if (!_arrange(image, region, yStart, yEnd)) return nullptr;
+
+    //_arrange() only rejects a range starting below the drawing area. A range that ends above
+    //it, or that is empty, still gets here and must not reach malloc() as a bogus size.
+    auto height = yEnd - yStart;
+    if (height <= 0) return nullptr;
+
+    auto aaSpans = static_cast<AASpans*>(malloc(sizeof(AASpans)));
+    if (!aaSpans) return nullptr;
+
+    aaSpans->lines = static_cast<AALine*>(malloc(height * sizeof(AALine)));
+    if (!aaSpans->lines) {
+        free(aaSpans);
+        return nullptr;
+    }
+
+    aaSpans->yStart = yStart;
+    aaSpans->yEnd = yEnd;
+
+    //Initialize X range
+    for (int32_t i = 0; i < height; i++) {
+        aaSpans->lines[i].x[0] = INT32_MAX;
+        aaSpans->lines[i].x[1] = 0;
+        aaSpans->lines[i].length[0] = 0;
+        aaSpans->lines[i].length[1] = 0;
+    }
+    return aaSpans;
+}
+
+
+/*
+ * Writes a linear coverage ramp over the (diagonal + 2) rows located above (y - edgeDist),
+ * one pixel long each. The ramp direction is given by reverse and is flipped for the right
+ * edge (eidx == 1). Stops as soon as the ramp would leave the top of the table.
+ */
+static void _calcIrregularCoverage(AALine* lines, int32_t eidx, int32_t y, int32_t diagonal, int32_t edgeDist, bool reverse)
+{
+    const bool descending = (eidx == 1) ? !reverse : reverse;
+    const int32_t steps = diagonal + 2;
+    const int32_t unit = 255 / steps;
+
+    for (int32_t ry = 0; ry < steps; ++ry) {
+        const int32_t row = y - ry - edgeDist;
+        if (row < 0) break;
+
+        const int32_t ramp = unit * ry;
+        lines[row].length[eidx] = 1;
+        lines[row].coverage[eidx] = descending ? (255 - ramp) : ramp;
+    }
+}
+
+
+/*
+ * Writes a linear coverage ramp over the rewind rows right above y, one pixel long each.
+ * The ramp direction is given by reverse and is flipped for the right edge (eidx == 1).
+ * Stops as soon as the ramp would leave the top of the table.
+ */
+static void _calcVertCoverage(AALine *lines, int32_t eidx, int32_t y, int32_t rewind, bool reverse)
+{
+    const bool descending = (eidx == 1) ? !reverse : reverse;
+    const int32_t unit = 255 / (rewind + 1);
+
+    for (int32_t ry = 1; ry <= rewind; ++ry) {
+        const int32_t row = y - ry;
+        if (row < 0) break;
+
+        const int32_t ramp = unit * ry;
+        lines[row].length[eidx] = 1;
+        lines[row].coverage[eidx] = descending ? (255 - ramp) : ramp;
+    }
+}
+
+
+/*
+ * Sets the smoothing length of row y to the horizontal distance between x and x2 and spreads
+ * the full coverage (255) evenly over (length + 1) steps.
+ */
+static void _calcHorizCoverage(AALine *lines, int32_t eidx, int32_t y, int32_t x, int32_t x2)
+{
+    auto& line = lines[y];
+    line.length[eidx] = abs(x - x2);
+    line.coverage[eidx] = 255 / (line.length[eidx] + 1);
+}
+
+
+/*
+ * Walks one edge of the rasterized rows from top to bottom (eidx 0: left edge, 1: right edge)
+ * and fills lines[].coverage[eidx] and lines[].length[eidx], which _apply() uses afterwards to
+ * smooth that edge. Rows whose x[0] is still INT32_MAX (nothing was drawn there) are skipped.
+ * The Dir* and PUSH_VERTEX macros defined inside stay defined after this function.
+ *
+ * This Anti-Aliasing mechanism is originated from Hermet Park's idea.
+ * To understand this AA logic, you can refer to this page:
+ * https://uigraphics.tistory.com/1
+*/
+static void _calcAAEdge(AASpans *aaSpans, int32_t eidx)
+{
+//Previous edge direction:
+#define DirOutHor 0x0011
+#define DirOutVer 0x0001
+#define DirInHor  0x0010
+#define DirInVer  0x0000
+#define DirNone   0x1000
+
+#define PUSH_VERTEX() \
+    do { \
+        pEdge.x = lines[y].x[eidx]; \
+        pEdge.y = y; \
+        ptx[0] = tx[0]; \
+        ptx[1] = tx[1]; \
+    } while (0)
+
+    struct Point
+    {
+        int32_t x, y;
+    };
+
+    int32_t y = 0;
+    Point pEdge = {-1, -1};       //previous edge point
+    Point edgeDiff = {0, 0};      //temporary used for point distance
+
+    /* store bigger to tx[0] between prev and current edge's x positions. */
+    int32_t tx[2] = {0, 0};
+    /* back up prev tx values */
+    int32_t ptx[2] = {0, 0};
+    int32_t diagonal = 0;           //straight diagonal pixels count
+
+    auto yStart = aaSpans->yStart;
+    auto yEnd = aaSpans->yEnd;
+    auto lines = aaSpans->lines;
+
+    int32_t prevDir = DirNone;
+    int32_t curDir = DirNone;
+
+    yEnd -= yStart;     //from here on rows are table indices in [0, yEnd)
+
+    //Start Edge
+    if (y < yEnd) {
+        pEdge.x = lines[y].x[eidx];
+        pEdge.y = y;
+    }
+
+    //Calculates AA Edges
+    for (y++; y < yEnd; y++) {
+
+        if (lines[y].x[0] == INT32_MAX) continue;     //row was never drawn
+
+        //Ready tx
+        if (eidx == 0) {
+            tx[0] = pEdge.x;
+            tx[1] = lines[y].x[0];
+        } else {
+            tx[0] = lines[y].x[1];
+            tx[1] = pEdge.x;
+        }
+        edgeDiff.x = (tx[0] - tx[1]);
+        edgeDiff.y = (y - pEdge.y);
+
+        //Confirm current edge direction
+        if (edgeDiff.x > 0) {
+            if (edgeDiff.y == 1) curDir = DirOutHor;
+            else curDir = DirOutVer;
+        } else if (edgeDiff.x < 0) {
+            if (edgeDiff.y == 1) curDir = DirInHor;
+            else curDir = DirInVer;
+        } else curDir = DirNone;
+
+        //straight diagonal increase
+        if ((curDir == prevDir) && (y < yEnd)) {
+            if ((abs(edgeDiff.x) == 1) && (edgeDiff.y == 1)) {
+                ++diagonal;
+                PUSH_VERTEX();
+                continue;
+            }
+        }
+
+        switch (curDir) {
+            case DirOutHor: {
+                _calcHorizCoverage(lines, eidx, y, tx[0], tx[1]);
+                if (diagonal > 0) {
+                    _calcIrregularCoverage(lines, eidx, y, diagonal, 0, true);
+                    diagonal = 0;
+                }
+               /* Increment direction is changed: Outside Vertical -> Outside Horizontal */
+               if (prevDir == DirOutVer) _calcHorizCoverage(lines, eidx, pEdge.y, ptx[0], ptx[1]);
+
+               //Trick, but fine-tuning!
+               if (y == 1) _calcHorizCoverage(lines, eidx, pEdge.y, tx[0], tx[1]);
+               PUSH_VERTEX();
+            }
+            break;
+            case DirOutVer: {
+                _calcVertCoverage(lines, eidx, y, edgeDiff.y, true);
+                if (diagonal > 0) {
+                    _calcIrregularCoverage(lines, eidx, y, diagonal, edgeDiff.y, false);
+                    diagonal = 0;
+                }
+               /* Increment direction is changed: Outside Horizontal -> Outside Vertical */
+               if (prevDir == DirOutHor) _calcHorizCoverage(lines, eidx, pEdge.y, ptx[0], ptx[1]);
+               PUSH_VERTEX();
+            }
+            break;
+            case DirInHor: {
+                _calcHorizCoverage(lines, eidx, (y - 1), tx[0], tx[1]);
+                if (diagonal > 0) {
+                    _calcIrregularCoverage(lines, eidx, y, diagonal, 0, false);
+                    diagonal = 0;
+                }
+                /* Increment direction is changed: Outside Horizontal -> Inside Horizontal */
+               if (prevDir == DirOutHor) _calcHorizCoverage(lines, eidx, pEdge.y, ptx[0], ptx[1]);
+               PUSH_VERTEX();
+            }
+            break;
+            case DirInVer: {
+                _calcVertCoverage(lines, eidx, y, edgeDiff.y, false);
+                if (prevDir == DirOutHor) edgeDiff.y -= 1;      //NOTE(review): empirical fine-tuning, the rationale is not documented
+                if (diagonal > 0) {
+                    _calcIrregularCoverage(lines, eidx, y, diagonal, edgeDiff.y, true);
+                    diagonal = 0;
+                }
+                /* Increment direction is changed: Outside Horizontal -> Inside Vertical */
+                if (prevDir == DirOutHor) _calcHorizCoverage(lines, eidx, pEdge.y, ptx[0], ptx[1]);
+                PUSH_VERTEX();
+            }
+            break;
+        }
+        if (curDir != DirNone) prevDir = curDir;
+    }
+
+    //leftovers: finish the edge piece that was still open when the last row was reached
+    if ((edgeDiff.y == 1) && (edgeDiff.x != 0)) {
+        if (y >= yEnd) y = (yEnd - 1);
+        _calcHorizCoverage(lines, eidx, y - 1, ptx[0], ptx[1]);
+        _calcHorizCoverage(lines, eidx, y, tx[0], tx[1]);
+    } else {
+        ++y;
+        if (y > yEnd) y = yEnd;
+        _calcVertCoverage(lines, eidx, y, (edgeDiff.y + 1), (prevDir & 0x00000001));
+    }
+}
+
+
+/*
+ * Anti-aliasing pass: computes the edge coverage of both sides, then smooths the left and the
+ * right edge pixels of every non-empty row by interpolating them with the neighboring pixel
+ * just outside the edge. Releases aaSpans (and its lines) before returning. Always returns true.
+ */
+static bool _apply(SwSurface* surface, AASpans* aaSpans)
+{
+    _calcAAEdge(aaSpans, 0);    //left side
+    _calcAAEdge(aaSpans, 1);    //right side
+
+    const auto bufEnd = surface->buf32 + surface->h * surface->stride;
+
+    for (auto row = aaSpans->yStart; row < aaSpans->yEnd; ++row) {
+        auto& line = aaSpans->lines[row - aaSpans->yStart];
+
+        //nothing was drawn on this row
+        if (line.x[1] - line.x[0] <= 0) continue;
+
+        auto rowOffset = row * surface->stride;
+
+        //Left edge: smooth rightwards, starting from the first drawn pixel
+        {
+            auto px = surface->buf32 + (rowOffset + line.x[0]);
+            const uint32_t neighbor = (line.x[0] > 1) ? *(px - 1) : *px;
+            int32_t step = 1;
+
+            //shorten the run when it would pass the end of the buffer
+            if (px + line.length[0] >= bufEnd) step += (px + line.length[0] - bufEnd);
+
+            for (; step <= line.length[0]; ++step, ++px) {
+                *px = INTERPOLATE(*px, neighbor, line.coverage[0] * step);
+            }
+        }
+
+        //Right edge: smooth leftwards, starting from the last drawn pixel
+        {
+            auto px = surface->buf32 + rowOffset + line.x[1] - 1;
+            const uint32_t neighbor = (line.x[1] < (int32_t)(surface->w - 1)) ? *(px + 1) : *px;
+            int32_t remain = line.length[1];
+
+            //shorten the run when it would pass the start of the buffer
+            if (px - remain < surface->buf32) --remain;
+
+            for (; remain > 0; --remain, --px) {
+                *px = INTERPOLATE(*px, neighbor, 255 - (line.coverage[1] * remain));
+            }
+        }
+    }
+
+    free(aaSpans->lines);
+    free(aaSpans);
+
+    return true;
+}
+
+
+/*
+    Draws the image as a transformed quad built from 2 triangles (1 mesh), then anti-aliases
+    its outline. The figure below shows the vertex indices of the quad.
+    For a better quality, the mesh would have to be divided into more triangles.
+
+    0 -- 1
+    |  / |
+    | /  |
+    3 -- 2
+
+    Returns false for 8-bit (grayscale) surfaces, which are not supported here, and true
+    otherwise, including the cases where there is nothing to draw.
+*/
+static bool _rasterTexmapPolygon(SwSurface* surface, const SwImage* image, const Matrix& transform, const SwBBox* region, uint8_t opacity)
+{
+    if (surface->channelSize == sizeof(uint8_t)) {
+        TVGERR("SW_ENGINE", "Not supported grayscale Textmap polygon!");
+        return false;
+    }
+
+    //Exceptions: No dedicated drawing area?
+    if (image->rle) {
+        if (image->rle->size == 0) return true;
+    } else if (!region) {
+        return true;
+    }
+
+    //Quad corners: position and texture coordinate start out identical, in image pixels
+    const auto w = float(image->w);
+    const auto h = float(image->h);
+
+    Vertex vertices[4] = {
+        {{0.0f, 0.0f}, {0.0f, 0.0f}},
+        {{w, 0.0f}, {w, 0.0f}},
+        {{w, h}, {w, h}},
+        {{0.0f, h}, {0.0f, h}}
+    };
+
+    //Transform the positions and collect the vertical extent of the quad
+    float ys = FLT_MAX, ye = -1.0f;
+    for (auto& vertex : vertices) {
+        vertex.pt *= transform;
+        if (vertex.pt.y < ys) ys = vertex.pt.y;
+        if (vertex.pt.y > ye) ye = vertex.pt.y;
+    }
+
+    auto aaSpans = _AASpans(ys, ye, image, region);
+    if (!aaSpans) return true;
+
+    //The first polygon (0, 1, 3), then the second one (1, 2, 3)
+    const int corners[2][3] = {{0, 1, 3}, {1, 2, 3}};
+
+    for (const auto& corner : corners) {
+        Polygon polygon;
+        polygon.vertex[0] = vertices[corner[0]];
+        polygon.vertex[1] = vertices[corner[1]];
+        polygon.vertex[2] = vertices[corner[2]];
+
+        _rasterPolygonImage(surface, image, region, polygon, aaSpans, opacity);
+    }
+
+#if 0
+    if (_compositing(surface) && _masking(surface) && !_direct(surface->compositor->method)) {
+        _compositeMaskImage(surface, &surface->compositor->image, surface->compositor->bbox);
+    }
+#endif
+    return _apply(surface, aaSpans);
+}
--- a/thirdparty/thorvg/src/renderer/sw_engine/tvgSwRenderer.cpp
+++ b/thirdparty/thorvg/src/renderer/sw_engine/tvgSwRenderer.cpp
@@ -0,0 +1,840 @@
+/*
+ * Copyright (c) 2020 - 2024 the ThorVG project. All rights reserved.
+
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifdef THORVG_SW_OPENMP_SUPPORT
+    #include <omp.h>
+#endif
+#include <algorithm>
+#include "tvgMath.h"
+#include "tvgSwCommon.h"
+#include "tvgTaskScheduler.h"
+#include "tvgSwRenderer.h"
+
+/************************************************************************/
+/* Internal Class Implementation                                        */
+/************************************************************************/
+static int32_t initEngineCnt = 0;           //Balance of SwRenderer::init(threads) / SwRenderer::term() calls
+static int32_t rendererCnt = 0;             //Renderers created by SwRenderer::gen() and not destroyed yet
+static SwMpool* globalMpool = nullptr;      //Memory pool shared among the renderers
+static uint32_t threadsCnt = 0;             //Thread count passed to the init() call that created globalMpool
+
+/* Common state of an asynchronous preparation task (shape or image).
+   SwRenderer::prepareCommon() fills the fields in and schedules the task. */
+struct SwTask : Task
+{
+    SwSurface* surface = nullptr;
+    SwMpool* mpool = nullptr;
+    SwBBox bbox{};                        //Rendering Region (value-initialized: bounds() may run before any prepare assigns it)
+    Matrix transform;
+    Array<RenderData> clips;
+    RenderUpdateFlag flags = RenderUpdateFlag::None;
+    uint8_t opacity = 0;                  //Starts invisible: prepareCommon() can return before assigning it and the render calls read it
+    bool pushed = false;                  //Pushed into task list?
+    bool disposed = false;                //Disposed task?
+
+    /* Returns the rendering region as x/y/w/h.
+       The origin is clamped to zero and a negative extent is reported as zero. */
+    RenderRegion bounds()
+    {
+        //Can we skip the synchronization?
+        done();
+
+        RenderRegion region;
+
+        //Range over?
+        region.x = bbox.min.x > 0 ? bbox.min.x : 0;
+        region.y = bbox.min.y > 0 ? bbox.min.y : 0;
+        region.w = bbox.max.x - region.x;
+        region.h = bbox.max.y - region.y;
+        if (region.w < 0) region.w = 0;
+        if (region.h < 0) region.h = 0;
+
+        return region;
+    }
+
+    //Releases the data owned by the concrete task.
+    virtual void dispose() = 0;
+    //Clips the target rle by this task; returns false on failure.
+    virtual bool clip(SwRle* target) = 0;
+    virtual ~SwTask() {}
+};
+
+
+/* Preparation task of a shape: generates the fill rle, the stroke rle and the
+   gradient color tables, then applies the clippers. */
+struct SwShapeTask : SwTask
+{
+    SwShape shape;
+    const RenderShape* rshape = nullptr;
+    bool clipper = false;
+
+    /* We assume that if the stroke width is greater than 2,
+       the shape's outline beneath the stroke could be adequately covered by the stroke drawing.
+       Therefore, antialiasing is disabled under this condition.
+       Additionally, the stroke style should not be dashed.
+       Note: rshape->stroke is only dereferenced when strokeWidth >= 2, so the caller is expected
+       to pass the value of validStrokeWidth(), which is 0 when there is no stroke. */
+    bool antialiasing(float strokeWidth)
+    {
+        return strokeWidth < 2.0f || rshape->stroke->dashCnt > 0 || rshape->stroke->strokeFirst || rshape->strokeTrim() || rshape->stroke->color[3] < 255;
+    }
+
+    /* Returns the stroke width scaled by the transform, or 0 when the stroke is absent,
+       has zero width, is fully transparent without a stroke fill, or is trimmed to nothing. */
+    float validStrokeWidth()
+    {
+        if (!rshape->stroke) return 0.0f;
+
+        auto width = rshape->stroke->width;
+        if (tvg::zero(width)) return 0.0f;
+
+        if (!rshape->stroke->fill && (MULTIPLY(rshape->stroke->color[3], opacity) == 0)) return 0.0f;
+        if (tvg::zero(rshape->stroke->trim.begin - rshape->stroke->trim.end)) return 0.0f;
+
+        return (width * sqrt(transform.e11 * transform.e11 + transform.e12 * transform.e12));
+    }
+
+    //Clips the target by the bounding box for fast-track shapes, otherwise by the shape rle.
+    bool clip(SwRle* target) override
+    {
+        if (shape.fastTrack) return rleClip(target, &bbox);
+        else if (shape.rle) return rleClip(target, shape.rle);
+        return false;
+    }
+
+    /* Task body. On any failure the bounding box and the generated data are reset (see err). */
+    void run(unsigned tid) override
+    {
+        //Invisible
+        if (opacity == 0 && !clipper) {
+            bbox.reset();
+            return;
+        }
+
+        auto strokeWidth = validStrokeWidth();
+        SwBBox renderRegion{};
+        auto updateShape = flags & (RenderUpdateFlag::Path | RenderUpdateFlag::Transform | RenderUpdateFlag::Clip);
+        auto updateFill = false;
+
+        //Shape
+        if (updateShape || flags & (RenderUpdateFlag::Color | RenderUpdateFlag::Gradient)) {
+            uint8_t alpha = 0;
+            rshape->fillColor(nullptr, nullptr, nullptr, &alpha);
+            //A fill is needed when the solid color is visible or a fill object exists
+            updateFill = (MULTIPLY(alpha, opacity) || rshape->fill);
+            if (updateShape) shapeReset(&shape);
+            if (updateFill || clipper) {
+                if (shapePrepare(&shape, rshape, transform, bbox, renderRegion, mpool, tid, clips.count > 0 ? true : false)) {
+                    if (!shapeGenRle(&shape, rshape, antialiasing(strokeWidth))) goto err;
+                } else {
+                    updateFill = false;
+                    renderRegion.reset();
+                }
+            }
+        }
+        //Fill
+        if (updateFill) {
+            if (auto fill = rshape->fill) {
+                auto ctable = (flags & RenderUpdateFlag::Gradient) ? true : false;
+                if (ctable) shapeResetFill(&shape);
+                if (!shapeGenFillColors(&shape, fill, transform, surface, opacity, ctable)) goto err;
+            }
+        }
+        //Stroke
+        if (updateShape || flags & RenderUpdateFlag::Stroke) {
+            if (strokeWidth > 0.0f) {
+                shapeResetStroke(&shape, rshape, transform);
+                if (!shapeGenStrokeRle(&shape, rshape, transform, bbox, renderRegion, mpool, tid)) goto err;
+                if (auto fill = rshape->strokeFill()) {
+                    auto ctable = (flags & RenderUpdateFlag::GradientStroke) ? true : false;
+                    if (ctable) shapeResetStrokeFill(&shape);
+                    if (!shapeGenStrokeFillColors(&shape, fill, transform, surface, opacity, ctable)) goto err;
+                }
+            } else {
+                shapeDelStroke(&shape);
+            }
+        }
+
+        //Clear current task memorypool here if the clippers would use the same memory pool
+        shapeDelOutline(&shape, mpool, tid);
+
+        //Clip Path (the local is not named `clipper` so it does not hide the member flag)
+        for (auto clip = clips.begin(); clip < clips.end(); ++clip) {
+            auto clipTask = static_cast<SwTask*>(*clip);
+            if (shape.rle && !clipTask->clip(shape.rle)) goto err;                 //Clip shape rle
+            if (shape.strokeRle && !clipTask->clip(shape.strokeRle)) goto err;     //Clip stroke rle
+        }
+
+        bbox = renderRegion; //sync
+
+        return;
+
+    err:
+        bbox.reset();
+        shapeReset(&shape);
+        rleReset(shape.strokeRle);
+        shapeDelOutline(&shape, mpool, tid);
+    }
+
+    void dispose() override
+    {
+       shapeFree(&shape);
+    }
+};
+
+
+/* Preparation task of an image: binds the source surface to the SwImage and,
+   when clippers exist, generates and clips the image rle. */
+struct SwImageTask : SwTask
+{
+    SwImage image;
+    RenderSurface* source = nullptr;      //Image source, assigned by SwRenderer::prepare() before the task is scheduled
+
+    //An image is not expected to act as a clipper: logs an error and reports success.
+    bool clip(SwRle* target) override
+    {
+        TVGERR("SW_ENGINE", "Image is used as ClipPath?");
+        return true;
+    }
+
+    void run(unsigned tid) override
+    {
+        //Keep the incoming region: `bbox` is also passed to imagePrepare() as a separate argument
+        auto clipRegion = bbox;
+
+        //Convert colorspace if it's not aligned.
+        rasterConvertCS(source, surface->cs);
+        rasterPremultiply(source);
+
+        image.data = source->data;
+        image.w = source->w;
+        image.h = source->h;
+        image.stride = source->stride;
+        image.channelSize = source->channelSize;
+
+        //Invisible shape turned to visible by alpha.
+        if ((flags & (RenderUpdateFlag::Image | RenderUpdateFlag::Transform | RenderUpdateFlag::Color)) && (opacity > 0)) {
+            imageReset(&image);
+            if (!image.data || image.w == 0 || image.h == 0) goto end;
+
+            if (!imagePrepare(&image, transform, clipRegion, bbox, mpool, tid)) goto end;
+
+            //The rle is only generated when there is something to clip with
+            if (clips.count > 0) {
+                if (!imageGenRle(&image, bbox, false)) goto end;
+                if (image.rle) {
+                    //Clear current task memorypool here if the clippers would use the same memory pool
+                    imageDelOutline(&image, mpool, tid);
+                    for (auto clip = clips.begin(); clip < clips.end(); ++clip) {
+                        auto clipper = static_cast<SwTask*>(*clip);
+                        if (!clipper->clip(image.rle)) goto err;
+                    }
+                    return;
+                }
+            }
+        }
+        goto end;
+    err:
+        rleReset(image.rle);
+    end:
+        imageDelOutline(&image, mpool, tid);
+    }
+
+    void dispose() override
+    {
+       imageFree(&image);
+    }
+};
+
+
+//Releases the shared memory pool once no renderer exists anymore.
+static void _termEngine()
+{
+    if (rendererCnt <= 0) {
+        mpoolTerm(globalMpool);
+        globalMpool = nullptr;
+    }
+}
+
+
+//Rasterizes the shape fill: the fill object when there is one, otherwise the solid color.
+static void _renderFill(SwShapeTask* task, SwSurface* surface, uint8_t opacity)
+{
+    if (auto fill = task->rshape->fill) {
+        rasterGradientShape(surface, &task->shape, fill, opacity);
+        return;
+    }
+
+    //Solid color: skipped when the resulting alpha is zero
+    uint8_t r, g, b, a;
+    task->rshape->fillColor(&r, &g, &b, &a);
+    a = MULTIPLY(opacity, a);
+    if (a > 0) rasterShape(surface, &task->shape, r, g, b, a);
+}
+
+//Rasterizes the shape stroke: the stroke fill when there is one, otherwise the solid stroke color.
+static void _renderStroke(SwShapeTask* task, SwSurface* surface, uint8_t opacity)
+{
+    if (auto strokeFill = task->rshape->strokeFill()) {
+        rasterGradientStroke(surface, &task->shape, strokeFill, opacity);
+        return;
+    }
+
+    //Solid color: nothing is drawn when no stroke color is reported or the resulting alpha is zero
+    uint8_t r, g, b, a;
+    if (!task->rshape->strokeColor(&r, &g, &b, &a)) return;
+    a = MULTIPLY(opacity, a);
+    if (a > 0) rasterStroke(surface, &task->shape, r, g, b, a);
+}
+
+/************************************************************************/
+/* External Class Implementation                                        */
+/************************************************************************/
+
+SwRenderer::~SwRenderer()
+{
+    clearCompositors();
+    delete(surface);
+
+    //Only a private pool is terminated here; the shared one is handled by _termEngine()
+    if (!sharedMpool) mpoolTerm(mpool);
+
+    //Last renderer destroyed while the engine init count is already zero: terminate the engine now
+    if (--rendererCnt == 0 && initEngineCnt == 0) _termEngine();
+}
+
+
+//Drops the pending task list, clears a private memory pool and resets the viewport to the whole surface.
+bool SwRenderer::clear()
+{
+    for (auto p = tasks.begin(); p < tasks.end(); ++p) {
+        auto task = *p;
+        //Disposed tasks were kept alive only because they were still listed here
+        if (task->disposed) {
+            delete(task);
+            continue;
+        }
+        task->done();
+        task->pushed = false;
+    }
+    tasks.clear();
+
+    if (!sharedMpool) mpoolClear(mpool);
+
+    if (surface) {
+        vport.x = 0;
+        vport.y = 0;
+        vport.w = surface->w;
+        vport.h = surface->h;
+    }
+
+    return true;
+}
+
+
+//Nothing to synchronize here: always reports success.
+bool SwRenderer::sync()
+{
+    return true;
+}
+
+
+//Returns the current viewport.
+RenderRegion SwRenderer::viewport()
+{
+    return vport;
+}
+
+
+//Stores the given viewport as is (no validation); always returns true.
+bool SwRenderer::viewport(const RenderRegion& vp)
+{
+    vport = vp;
+    return true;
+}
+
+
+/* Binds the caller's pixel buffer as the main render target.
+   Returns false for a null buffer, a zero stride/width/height or a width larger than the stride;
+   otherwise returns the result of rasterCompositor(). */
+bool SwRenderer::target(pixel_t* data, uint32_t stride, uint32_t w, uint32_t h, ColorSpace cs)
+{
+    if (!data) return false;
+    if (stride == 0 || w == 0 || h == 0) return false;
+    if (w > stride) return false;
+
+    //Cached composition buffers are released whenever a target is (re)bound
+    clearCompositors();
+
+    if (!surface) surface = new SwSurface;
+
+    surface->data = data;
+    surface->w = w;
+    surface->h = h;
+    surface->stride = stride;
+    surface->cs = cs;
+    surface->channelSize = CHANNEL_SIZE(cs);
+    surface->premultiplied = true;
+
+    return rasterCompositor(surface);
+}
+
+
+/* Clears the whole render target before drawing.
+   Returns false when no target has been bound yet (the surface is created by target()),
+   otherwise the result of rasterClear(). */
+bool SwRenderer::preRender()
+{
+    if (!surface) return false;
+    return rasterClear(surface, 0, 0, surface->w, surface->h);
+}
+
+
+//Frees every cached composition surface together with its compositor and pixel buffer.
+void SwRenderer::clearCompositors()
+{
+    for (auto p = compositors.begin(); p < compositors.end(); ++p) {
+        auto sfc = *p;
+        auto cmp = sfc->compositor;
+        free(cmp->image.data);      //allocated with malloc() in request()
+        delete(cmp);
+        delete(sfc);
+    }
+    compositors.reset();
+}
+
+
+/* Finishes a frame: converts the target back to straight alpha when its colorspace asks for it
+   and releases the task list. Always returns true. */
+bool SwRenderer::postRender()
+{
+    //Unmultiply alpha if needed (the surface is null until a target is bound)
+    if (surface && (surface->cs == ColorSpace::ABGR8888S || surface->cs == ColorSpace::ARGB8888S)) {
+        rasterUnpremultiply(surface);
+    }
+
+    //Disposed tasks were kept alive only while they were listed; the others may be pushed again
+    for (auto task = tasks.begin(); task < tasks.end(); ++task) {
+        if ((*task)->disposed) delete(*task);
+        else (*task)->pushed = false;
+    }
+    tasks.clear();
+
+    return true;
+}
+
+
+/* Draws a prepared image task onto the current surface.
+   Returns false for a null task, true when the image is fully transparent,
+   otherwise the result of rasterImage(). */
+bool SwRenderer::renderImage(RenderData data)
+{
+    auto task = static_cast<SwImageTask*>(data);
+    if (!task) return false;
+
+    //Wait for the asynchronous preparation
+    task->done();
+
+    if (task->opacity == 0) return true;
+
+    return rasterImage(surface, &task->image, task->transform, task->bbox, task->opacity);
+}
+
+
+/* Draws a prepared shape task onto the current surface.
+   Returns false for a null task, true otherwise (also when nothing is drawn). */
+bool SwRenderer::renderShape(RenderData data)
+{
+    auto task = static_cast<SwShapeTask*>(data);
+    if (!task) return false;
+
+    //Wait for the asynchronous preparation
+    task->done();
+
+    if (task->opacity == 0) return true;
+
+    //Main raster stage: the stroke goes below the fill only when strokeFirst is set
+    auto stroke = task->rshape->stroke;
+    const bool strokeFirst = stroke && stroke->strokeFirst;
+
+    if (strokeFirst) _renderStroke(task, surface, task->opacity);
+    _renderFill(task, surface, task->opacity);
+    if (!strokeFirst) _renderStroke(task, surface, task->opacity);
+
+    return true;
+}
+
+
+/* Selects the blend operator of the current surface.
+   Returns true when the method is already active.
+   NOTE(review): after switching the method the function returns false for every branch,
+   supported or not - confirm against the callers whether that is intended. */
+bool SwRenderer::blend(BlendMethod method)
+{
+    if (surface->blendMethod == method) return true;
+    surface->blendMethod = method;
+
+    switch (method) {
+        case BlendMethod::Normal:     surface->blender = nullptr; break;
+        case BlendMethod::Multiply:   surface->blender = opBlendMultiply; break;
+        case BlendMethod::Screen:     surface->blender = opBlendScreen; break;
+        case BlendMethod::Overlay:    surface->blender = opBlendOverlay; break;
+        case BlendMethod::Darken:     surface->blender = opBlendDarken; break;
+        case BlendMethod::Lighten:    surface->blender = opBlendLighten; break;
+        case BlendMethod::ColorDodge: surface->blender = opBlendColorDodge; break;
+        case BlendMethod::ColorBurn:  surface->blender = opBlendColorBurn; break;
+        case BlendMethod::HardLight:  surface->blender = opBlendHardLight; break;
+        case BlendMethod::SoftLight:  surface->blender = opBlendSoftLight; break;
+        case BlendMethod::Difference: surface->blender = opBlendDifference; break;
+        case BlendMethod::Exclusion:  surface->blender = opBlendExclusion; break;
+        case BlendMethod::Add:        surface->blender = opBlendAdd; break;
+        default: {
+            //Unknown methods fall back to no blender
+            TVGLOG("SW_ENGINE", "Non supported blending option = %d", (int) method);
+            surface->blender = nullptr;
+            break;
+        }
+    }
+    return false;
+}
+
+
+//Returns the rendering region of a prepared task. `data` is dereferenced without a null check.
+RenderRegion SwRenderer::region(RenderData data)
+{
+    return static_cast<SwTask*>(data)->bounds();
+}
+
+
+/* Records the composite method and opacity on the compositor.
+   For any method other than None, the surface saved by target() becomes the active one
+   and this compositor is attached to it. Returns false only for a null compositor. */
+bool SwRenderer::beginComposite(RenderCompositor* cmp, CompositeMethod method, uint8_t opacity)
+{
+    if (!cmp) return false;
+
+    auto compositor = static_cast<SwCompositor*>(cmp);
+    compositor->method = method;
+    compositor->opacity = opacity;
+
+    //Plain composition keeps the current context
+    if (compositor->method == CompositeMethod::None) return true;
+
+    surface = compositor->recoverSfc;
+    surface->compositor = compositor;
+
+    return true;
+}
+
+
+/* Switches between the shared memory pool and a private one.
+   Returns true when the policy is unchanged or a pool is available afterwards,
+   false when the private pool cannot be terminated or no pool is available. */
+bool SwRenderer::mempool(bool shared)
+{
+    if (shared == sharedMpool) return true;
+
+    //From here on the requested policy differs from the current one
+    if (shared) {
+        //private -> shared: give up the private pool first
+        if (!mpoolTerm(mpool)) return false;
+        mpool = globalMpool;
+    } else {
+        //shared -> private
+        mpool = mpoolInit(threadsCnt);
+    }
+
+    sharedMpool = shared;
+
+    return mpool ? true : false;
+}
+
+
+//Returns the `surface` member, i.e. the currently active surface (null before a target is bound).
+const RenderSurface* SwRenderer::mainSurface()
+{
+    return surface;
+}
+
+
+/* Returns a composition surface with the requested channel size.
+   A cached surface is reused when it is not in use (valid), has the same channel size
+   and the same dimensions; otherwise a new one is allocated and cached.
+   With `square` set both dimensions are the larger side of the current surface. */
+SwSurface* SwRenderer::request(int channelSize, bool square)
+{
+    SwSurface* cmp = nullptr;
+    uint32_t w, h;
+
+    if (square) {
+        //Same Dimensional Size is demanded for the Post Processing Fast Flipping
+        w = h = std::max(surface->w, surface->h);
+    } else {
+        w = surface->w;
+        h = surface->h;
+    }
+
+    //Use cached data
+    for (auto p = compositors.begin(); p < compositors.end(); ++p) {
+        auto cur = *p;
+        if (cur->compositor->valid && cur->compositor->image.channelSize == channelSize) {
+            if (w == cur->w && h == cur->h) {
+                cmp = *p;
+                break;
+            }
+        }
+    }
+
+    //New Composition
+    if (!cmp) {
+        //Inherits attributes from main surface
+        cmp = new SwSurface(surface);
+        cmp->compositor = new SwCompositor;
+        //Compute the byte size in size_t: the 32-bit product wraps around for very large surfaces
+        cmp->compositor->image.data = (pixel_t*)malloc(static_cast<size_t>(channelSize) * w * h);
+        cmp->w = cmp->compositor->image.w = w;
+        cmp->h = cmp->compositor->image.h = h;
+        cmp->stride = cmp->compositor->image.stride = w;
+        cmp->compositor->image.direct = true;
+        cmp->compositor->valid = true;
+        cmp->channelSize = cmp->compositor->image.channelSize = channelSize;
+
+        compositors.push(cmp);
+    }
+
+    //Sync. This may have been modified by post-processing.
+    cmp->data = cmp->compositor->image.data;
+
+    return cmp;
+}
+
+
+/* Redirects rendering into a composition surface covering `region` clipped to the current surface.
+   Returns the compositor of the new target, or nullptr when the region lies outside
+   the surface or the clipped region is empty. */
+RenderCompositor* SwRenderer::target(const RenderRegion& region, ColorSpace cs, CompositionFlag flags)
+{
+    auto left = region.x;
+    auto top = region.y;
+    auto width = region.w;
+    auto height = region.h;
+    auto surfaceW = static_cast<int32_t>(surface->w);
+    auto surfaceH = static_cast<int32_t>(surface->h);
+
+    //Out of boundary
+    if (left >= surfaceW || top >= surfaceH) return nullptr;
+    if (left + width < 0 || top + height < 0) return nullptr;
+
+    auto cmp = request(CHANNEL_SIZE(cs), (flags & CompositionFlag::PostProcessing));
+
+    //Boundary Check
+    if (left < 0) left = 0;
+    if (top < 0) top = 0;
+    if (left + width > surfaceW) width = (surfaceW - left);
+    if (top + height > surfaceH) height = (surfaceH - top);
+
+    if (width == 0 || height == 0) return nullptr;
+
+    //Remember the current context so endComposite() can restore it, and mark the buffer as in use
+    auto compositor = cmp->compositor;
+    compositor->recoverSfc = surface;
+    compositor->recoverCmp = surface->compositor;
+    compositor->valid = false;
+    compositor->bbox.min.x = left;
+    compositor->bbox.min.y = top;
+    compositor->bbox.max.x = left + width;
+    compositor->bbox.max.y = top + height;
+
+    /* TODO: Currently, only blending might work.
+       Blending and composition must be handled together. */
+    auto color = (surface->blender && !surface->compositor) ? 0x00ffffff : 0x00000000;
+    rasterClear(cmp, left, top, width, height, color);
+
+    //Switch render target
+    surface = cmp;
+
+    return compositor;
+}
+
+
+/* Restores the context saved by target() and, for the default method (None),
+   draws the composition buffer onto the recovered surface.
+   Returns false for a null compositor, otherwise true or the result of rasterImage(). */
+bool SwRenderer::endComposite(RenderCompositor* cmp)
+{
+    if (!cmp) return false;
+
+    auto compositor = static_cast<SwCompositor*>(cmp);
+
+    //Recover Context
+    surface = compositor->recoverSfc;
+    surface->compositor = compositor->recoverCmp;
+
+    //only invalid (currently used) surface can be composited
+    if (compositor->valid) return true;
+    compositor->valid = true;
+
+    //Other methods have nothing more to do here
+    if (compositor->method != CompositeMethod::None) return true;
+
+    //Default is alpha blending, drawn with an identity transform
+    Matrix identity = {1, 0, 0, 0, 1, 0, 0, 0, 1};
+    return rasterImage(surface, &compositor->image, identity, compositor->bbox, compositor->opacity);
+}
+
+
+//Dispatches to the update routine matching the effect type; unknown types are ignored.
+void SwRenderer::prepare(RenderEffect* effect, const Matrix& transform)
+{
+    switch (effect->type) {
+        case SceneEffect::GaussianBlur:
+            effectGaussianBlurUpdate(static_cast<RenderEffectGaussianBlur*>(effect), transform);
+            break;
+        case SceneEffect::DropShadow:
+            effectDropShadowUpdate(static_cast<RenderEffectDropShadow*>(effect), transform);
+            break;
+        case SceneEffect::Fill:
+            effectFillUpdate(static_cast<RenderEffectFill*>(effect));
+            break;
+        case SceneEffect::Tint:
+            effectTintUpdate(static_cast<RenderEffectTint*>(effect));
+            break;
+        case SceneEffect::Tritone:
+            effectTritoneUpdate(static_cast<RenderEffectTritone*>(effect));
+            break;
+        default:
+            break;
+    }
+}
+
+
+//Queries the region routine of the effect; only GaussianBlur and DropShadow have one, others yield false.
+bool SwRenderer::region(RenderEffect* effect)
+{
+    const auto type = effect->type;
+
+    if (type == SceneEffect::GaussianBlur) {
+        return effectGaussianBlurRegion(static_cast<RenderEffectGaussianBlur*>(effect));
+    }
+    if (type == SceneEffect::DropShadow) {
+        return effectDropShadowRegion(static_cast<RenderEffectDropShadow*>(effect));
+    }
+    return false;
+}
+
+
+/* Applies a scene effect to the composition buffer.
+   Only 4-byte-per-pixel buffers are accepted; unknown effect types return false. */
+bool SwRenderer::render(RenderCompositor* cmp, const RenderEffect* effect, bool direct)
+{
+    auto compositor = static_cast<SwCompositor*>(cmp);
+
+    //NOTE(review): this message is emitted for every effect type, not only Gaussian blur
+    if (compositor->image.channelSize != sizeof(uint32_t)) {
+        TVGERR("SW_ENGINE", "Not supported grayscale Gaussian Blur!");
+        return false;
+    }
+
+    switch (effect->type) {
+        case SceneEffect::GaussianBlur:
+            return effectGaussianBlur(compositor, request(surface->channelSize, true), static_cast<const RenderEffectGaussianBlur*>(effect));
+        case SceneEffect::DropShadow: {
+            //Two distinct buffers are needed: flag the first as in use so request() does not hand it out again
+            auto first = request(surface->channelSize, true);
+            first->compositor->valid = false;
+            auto second = request(surface->channelSize, true);
+            SwSurface* buffers[] = {first, second};
+            auto ret = effectDropShadow(compositor, buffers, static_cast<const RenderEffectDropShadow*>(effect), direct);
+            first->compositor->valid = true;
+            return ret;
+        }
+        case SceneEffect::Fill:
+            return effectFill(compositor, static_cast<const RenderEffectFill*>(effect), direct);
+        case SceneEffect::Tint:
+            return effectTint(compositor, static_cast<const RenderEffectTint*>(effect), direct);
+        case SceneEffect::Tritone:
+            return effectTritone(compositor, static_cast<const RenderEffectTritone*>(effect), direct);
+        default:
+            return false;
+    }
+}
+
+
+//Returns the colorspace of the active surface, or Unsupported when there is no surface.
+ColorSpace SwRenderer::colorSpace()
+{
+    return surface ? surface->cs : ColorSpace::Unsupported;
+}
+
+
+/* Releases the data of a task. The task object itself is deleted right away unless it is
+   still in the task list; in that case it is only flagged and deleted when the list is flushed. */
+void SwRenderer::dispose(RenderData data)
+{
+    auto task = static_cast<SwTask*>(data);
+    if (!task) return;
+
+    //Wait for the asynchronous work before freeing anything
+    task->done();
+    task->dispose();
+
+    if (!task->pushed) {
+        delete(task);
+        return;
+    }
+    task->disposed = true;
+}
+
+
+/* Shared tail of both prepare() overloads: stores the update parameters on the task,
+   limits its region to the viewport within the surface and schedules it.
+   The task is returned untouched when there is no surface or nothing to update,
+   and unscheduled when the transform collapses the width or the height to zero. */
+void* SwRenderer::prepareCommon(SwTask* task, const Matrix& transform, const Array<RenderData>& clips, uint8_t opacity, RenderUpdateFlag flags)
+{
+    if (!surface || flags == RenderUpdateFlag::None) return task;
+
+    //TODO: Failed threading them. It would be better if it's possible.
+    //See: https://github.com/thorvg/thorvg/issues/1409
+    //Guarantee composition targets get ready.
+    for (auto p = clips.begin(); p < clips.end(); ++p) {
+        auto clipTask = static_cast<SwTask*>(*p);
+        clipTask->done();
+    }
+
+    task->clips = clips;
+    task->transform = transform;
+
+    //zero size?
+    const auto& m = task->transform;
+    if (m.e11 == 0.0f && m.e12 == 0.0f) return task; //zero width
+    if (m.e21 == 0.0f && m.e22 == 0.0f) return task; //zero height
+
+    task->opacity = opacity;
+    task->surface = surface;
+    task->mpool = mpool;
+    task->flags = flags;
+
+    //Rendering region: the viewport clamped to the surface
+    auto& box = task->bbox;
+    box.min.x = std::max(static_cast<SwCoord>(0), static_cast<SwCoord>(vport.x));
+    box.min.y = std::max(static_cast<SwCoord>(0), static_cast<SwCoord>(vport.y));
+    box.max.x = std::min(static_cast<SwCoord>(surface->w), static_cast<SwCoord>(vport.x + vport.w));
+    box.max.y = std::min(static_cast<SwCoord>(surface->h), static_cast<SwCoord>(vport.y + vport.h));
+
+    //List the task once, however often it is prepared
+    if (!task->pushed) {
+        tasks.push(task);
+        task->pushed = true;
+    }
+
+    TaskScheduler::request(task);
+
+    return task;
+}
+
+
+//Prepares an image: reuses the given task (after waiting for it) or creates one, then binds the source surface.
+RenderData SwRenderer::prepare(RenderSurface* surface, RenderData data, const Matrix& transform, Array<RenderData>& clips, uint8_t opacity, RenderUpdateFlag flags)
+{
+    auto task = static_cast<SwImageTask*>(data);
+
+    if (task) task->done();
+    else task = new SwImageTask;
+
+    //Note: the parameter hides the member `surface` inside this function
+    task->source = surface;
+
+    return prepareCommon(task, transform, clips, opacity, flags);
+}
+
+
+//Prepares a shape: reuses the given task (after waiting for it) or creates one, then binds the shape data.
+RenderData SwRenderer::prepare(const RenderShape& rshape, RenderData data, const Matrix& transform, Array<RenderData>& clips, uint8_t opacity, RenderUpdateFlag flags, bool clipper)
+{
+    auto task = static_cast<SwShapeTask*>(data);
+
+    if (task) task->done();
+    else task = new SwShapeTask;
+
+    //The task keeps the address of rshape, not a copy
+    task->clipper = clipper;
+    task->rshape = &rshape;
+
+    return prepareCommon(task, transform, clips, opacity, flags);
+}
+
+
+//A renderer starts out on the shared memory pool (sharedMpool defaults to true).
+SwRenderer::SwRenderer():mpool(globalMpool)
+{
+}
+
+
+/* Initializes the engine. Only the first call creates the shared memory pool;
+   later calls just increase the init count. Returns false when the pool cannot be created. */
+bool SwRenderer::init(uint32_t threads)
+{
+    if ((initEngineCnt++) > 0) return true;
+
+    threadsCnt = threads;
+
+    //Share the memory pool among the renderer
+    globalMpool = mpoolInit(threads);
+    if (globalMpool) return true;
+
+    //Roll the count back so that a later init() can try again
+    --initEngineCnt;
+    return false;
+}
+
+
+/* Returns the current engine init count.
+   With OpenMP support compiled in, the OpenMP thread count is first set to the task scheduler's. */
+int32_t SwRenderer::init()
+{
+#ifdef THORVG_SW_OPENMP_SUPPORT
+    omp_set_num_threads(TaskScheduler::threads());
+#endif
+
+    return initEngineCnt;
+}
+
+
+/* Decreases the init count and, once it reaches zero, asks for the engine termination
+   (which is deferred while renderers still exist). Always returns true. */
+bool SwRenderer::term()
+{
+    --initEngineCnt;
+
+    if (initEngineCnt <= 0) {
+        //Unbalanced term() calls must not leave a negative count
+        initEngineCnt = 0;
+        _termEngine();
+    }
+
+    return true;
+}
+
+//Creates a renderer and counts it; the count is decreased again by the destructor.
+SwRenderer* SwRenderer::gen()
+{
+    ++rendererCnt;
+    auto renderer = new SwRenderer();
+    return renderer;
+}
--- a/thirdparty/thorvg/src/renderer/sw_engine/tvgSwRenderer.h
+++ b/thirdparty/thorvg/src/renderer/sw_engine/tvgSwRenderer.h
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2020 - 2024 the ThorVG project. All rights reserved.
+
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef _TVG_SW_RENDERER_H_
+#define _TVG_SW_RENDERER_H_
+
+#include "tvgRender.h"
+
+//Forward declarations: these types are only used through pointers in this header.
+struct SwSurface;
+struct SwTask;
+struct SwCompositor;
+struct SwMpool;
+
+namespace tvg
+{
+
+/* Software implementation of RenderMethod.
+   Instances are created through gen(); the constructor and the destructor are private. */
+class SwRenderer : public RenderMethod
+{
+public:
+    //Preparation, rendering and query interface
+    RenderData prepare(const RenderShape& rshape, RenderData data, const Matrix& transform, Array<RenderData>& clips, uint8_t opacity, RenderUpdateFlag flags, bool clipper) override;
+    RenderData prepare(RenderSurface* surface, RenderData data, const Matrix& transform, Array<RenderData>& clips, uint8_t opacity, RenderUpdateFlag flags) override;
+    bool preRender() override;
+    bool renderShape(RenderData data) override;
+    bool renderImage(RenderData data) override;
+    bool postRender() override;
+    void dispose(RenderData data) override;
+    RenderRegion region(RenderData data) override;
+    RenderRegion viewport() override;
+    bool viewport(const RenderRegion& vp) override;
+    bool blend(BlendMethod method) override;
+    ColorSpace colorSpace() override;
+    const RenderSurface* mainSurface() override;
+
+    //Frame and target management
+    bool clear() override;
+    bool sync() override;
+    bool target(pixel_t* data, uint32_t stride, uint32_t w, uint32_t h, ColorSpace cs);
+    bool mempool(bool shared);
+
+    //Composition
+    RenderCompositor* target(const RenderRegion& region, ColorSpace cs, CompositionFlag flags) override;
+    bool beginComposite(RenderCompositor* cmp, CompositeMethod method, uint8_t opacity) override;
+    bool endComposite(RenderCompositor* cmp) override;
+    void clearCompositors();
+
+    //Scene effects
+    void prepare(RenderEffect* effect, const Matrix& transform) override;
+    bool region(RenderEffect* effect) override;
+    bool render(RenderCompositor* cmp, const RenderEffect* effect, bool direct) override;
+
+    //Engine lifecycle and instance factory
+    static SwRenderer* gen();
+    static bool init(uint32_t threads);
+    static int32_t init();
+    static bool term();
+
+private:
+    SwSurface*           surface = nullptr;           //active surface
+    Array<SwTask*>       tasks;                       //async task list
+    Array<SwSurface*>    compositors;                 //render targets cache list
+    SwMpool*             mpool;                       //private memory pool
+    RenderRegion         vport;                       //viewport
+    bool                 sharedMpool = true;          //memory-pool behavior policy
+
+    SwRenderer();
+    ~SwRenderer();
+
+    //Returns a (possibly cached) composition surface of the given channel size
+    SwSurface* request(int channelSize, bool square);
+    //Common part of both RenderData prepare() overloads
+    RenderData prepareCommon(SwTask* task, const Matrix& transform, const Array<RenderData>& clips, uint8_t opacity, RenderUpdateFlag flags);
+};
+
+}
+
+#endif /* _TVG_SW_RENDERER_H_ */
--- a/thirdparty/thorvg/src/renderer/sw_engine/tvgSwRle.cpp
+++ b/thirdparty/thorvg/src/renderer/sw_engine/tvgSwRle.cpp
--- a/thirdparty/thorvg/src/renderer/sw_engine/tvgSwShape.cpp
+++ b/thirdparty/thorvg/src/renderer/sw_engine/tvgSwShape.cpp
@@ -0,0 +1,675 @@
+/*
+ * Copyright (c) 2020 - 2024 the ThorVG project. All rights reserved.
+
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "tvgSwCommon.h"
+#include "tvgMath.h"
+
+/************************************************************************/
+/* Internal Class Implementation                                        */
+/************************************************************************/
+
+/* Opens a new contour when a lineTo/cubicTo arrives without a preceding moveTo
+   (i.e. right after a close). The last stored point is recorded as a contour end
+   and then duplicated so it also serves as the first point of the new contour.
+   Always returns false, which the caller stores as its new 'closed' state. */
+static bool _outlineBegin(SwOutline& outline)
+{
+    if (!outline.pts.empty()) {
+        auto lastIdx = outline.pts.count - 1;
+        outline.cntrs.push(lastIdx);
+        outline.closed.push(false);
+        //re-use the last point as the starting point of the new contour
+        outline.pts.push(outline.pts[lastIdx]);
+        outline.types.push(SW_CURVE_TYPE_POINT);
+    }
+    return false;
+}
+
+
+/* Finalizes the current contour as an open one: the index of the last stored point
+   is recorded as the contour end. Does nothing on an empty outline.
+   Always returns false. */
+static bool _outlineEnd(SwOutline& outline)
+{
+    if (!outline.pts.empty()) {
+        outline.cntrs.push(outline.pts.count - 1);
+        outline.closed.push(false);
+    }
+    return false;
+}
+
+
+/* Starts a new contour at the transformed position of 'to'.
+   closed: true if the previous contour has already been closed; when false the
+           previous contour is finalized (as an open one) first.
+   Always returns false, which the caller stores as its new 'closed' state. */
+static bool _outlineMoveTo(SwOutline& outline, const Point* to, const Matrix& transform, bool closed = false)
+{
+    if (!closed) _outlineEnd(outline);
+
+    auto start = mathTransform(to, transform);
+    outline.pts.push(start);
+    outline.types.push(SW_CURVE_TYPE_POINT);
+
+    return false;
+}
+
+
+/* Appends a straight segment ending at the transformed position of 'to'. */
+static void _outlineLineTo(SwOutline& outline, const Point* to, const Matrix& transform)
+{
+    auto end = mathTransform(to, transform);
+    outline.pts.push(end);
+    outline.types.push(SW_CURVE_TYPE_POINT);
+}
+
+
+/* Appends a cubic bezier segment: the two transformed control points are tagged
+   as SW_CURVE_TYPE_CUBIC, the transformed end point as SW_CURVE_TYPE_POINT. */
+static void _outlineCubicTo(SwOutline& outline, const Point* ctrl1, const Point* ctrl2, const Point* to, const Matrix& transform)
+{
+    const Point* srcs[3] = {ctrl1, ctrl2, to};
+
+    for (int i = 0; i < 3; ++i) {
+        outline.pts.push(mathTransform(srcs[i], transform));
+        //only the last entry is an on-curve point
+        outline.types.push(i < 2 ? SW_CURVE_TYPE_CUBIC : SW_CURVE_TYPE_POINT);
+    }
+}
+
+
+/* Closes the current contour by appending a copy of its first point.
+   Returns true if a contour was closed, false if the current contour has no points. */
+static bool _outlineClose(SwOutline& outline)
+{
+    //index of the first point of the current contour
+    const uint32_t first = (outline.cntrs.count > 0) ? outline.cntrs.last() + 1 : 0;
+
+    //nothing to close: no point has been added since the last contour end
+    if (outline.pts.count == first) return false;
+
+    outline.pts.push(outline.pts[first]);
+    outline.types.push(SW_CURVE_TYPE_POINT);
+    outline.cntrs.push(outline.pts.count - 1);
+    outline.closed.push(true);
+
+    return true;
+}
+
+
+/* Feeds the straight segment [dash.ptCur, *to] through the dash pattern and appends
+   the visible (non-gap) pieces to dash.outline.
+   dash.curLen holds what is left of the current pattern entry, dash.curIdx its index,
+   and dash.curOpGap tells whether that entry is a gap. dash.move marks that the next
+   visible piece has to start a new sub-path. On return dash.ptCur is set to *to. */
+static void _dashLineTo(SwDashStroke& dash, const Point* to, const Matrix& transform)
+{
+    Line cur = {dash.ptCur, *to};
+    auto len = cur.length();
+    //zero-length segment: only begin a new sub-path at the current point
+    if (tvg::zero(len)) {
+        _outlineMoveTo(*dash.outline, &dash.ptCur, transform);
+    //draw the current line fully
+    } else if (len <= dash.curLen) {
+        dash.curLen -= len;
+        if (!dash.curOpGap) {
+            if (dash.move) {
+                _outlineMoveTo(*dash.outline, &dash.ptCur, transform);
+                dash.move = false;
+            }
+            _outlineLineTo(*dash.outline, to, transform);
+        }
+    //draw the current line partially
+    } else {
+        //consume whole pattern entries while the segment clearly outlasts the current one
+        while (len - dash.curLen > DASH_PATTERN_THRESHOLD) {
+            Line left, right;
+            if (dash.curLen > 0) {
+                len -= dash.curLen;
+                cur.split(dash.curLen, left, right);
+                if (!dash.curOpGap) {
+                    //begin a new sub-path if a move is pending or the current entry is still untouched
+                    if (dash.move || dash.pattern[dash.curIdx] - dash.curLen < FLOAT_EPSILON) {
+                        _outlineMoveTo(*dash.outline, &left.pt1, transform);
+                        dash.move = false;
+                    }
+                    _outlineLineTo(*dash.outline, &left.pt2, transform);
+                }
+            } else {
+                //nothing left of the current entry: the whole segment goes to the next one
+                right = cur;
+            }
+            //advance to the next pattern entry (dash <-> gap)
+            dash.curIdx = (dash.curIdx + 1) % dash.cnt;
+            dash.curLen = dash.pattern[dash.curIdx];
+            dash.curOpGap = !dash.curOpGap;
+            cur = right;
+            dash.ptCur = cur.pt1;
+            dash.move = true;
+        }
+        //leftovers
+        dash.curLen -= len;
+        if (!dash.curOpGap) {
+            if (dash.move) {
+                _outlineMoveTo(*dash.outline, &cur.pt1, transform);
+                dash.move = false;
+            }
+            _outlineLineTo(*dash.outline, &cur.pt2, transform);
+        }
+        //NOTE(review): the cubic variant uses 0.1f as the threshold here - confirm the difference is intended
+        if (dash.curLen < 1 && TO_SWCOORD(len) > 1) {
+            //move to next dash
+            dash.curIdx = (dash.curIdx + 1) % dash.cnt;
+            dash.curLen = dash.pattern[dash.curIdx];
+            dash.curOpGap = !dash.curOpGap;
+        }
+    }
+    dash.ptCur = *to;
+}
+
+
+/* Feeds the cubic bezier (dash.ptCur, *ctrl1, *ctrl2, *to) through the dash pattern
+   and appends the visible (non-gap) pieces to dash.outline.
+   Mirrors _dashLineTo(): dash.curLen is what is left of the current pattern entry,
+   dash.curOpGap tells whether that entry is a gap and dash.move marks that the next
+   visible piece has to start a new sub-path. On return dash.ptCur is set to *to. */
+static void _dashCubicTo(SwDashStroke& dash, const Point* ctrl1, const Point* ctrl2, const Point* to, const Matrix& transform)
+{
+    Bezier cur = {dash.ptCur, *ctrl1, *ctrl2, *to};
+    auto len = cur.length();
+
+    //zero-length curve: only begin a new sub-path at the current point
+    if (tvg::zero(len)) {
+        _outlineMoveTo(*dash.outline, &dash.ptCur, transform);
+    //draw the current curve fully
+    } else if (len <= dash.curLen) {
+        dash.curLen -= len;
+        if (!dash.curOpGap) {
+            if (dash.move) {
+                _outlineMoveTo(*dash.outline, &dash.ptCur, transform);
+                dash.move = false;
+            }
+            _outlineCubicTo(*dash.outline, ctrl1, ctrl2, to, transform);
+        }
+    //draw the current curve partially
+    } else {
+        //consume whole pattern entries while the curve clearly outlasts the current one
+        while ((len - dash.curLen) > DASH_PATTERN_THRESHOLD) {
+            Bezier left, right;
+            if (dash.curLen > 0) {
+                len -= dash.curLen;
+                cur.split(dash.curLen, left, right);
+                if (!dash.curOpGap) {
+                    //begin a new sub-path if a move is pending or the current entry is still untouched
+                    if (dash.move || dash.pattern[dash.curIdx] - dash.curLen < FLOAT_EPSILON) {
+                        _outlineMoveTo(*dash.outline, &left.start, transform);
+                        dash.move = false;
+                    }
+                    _outlineCubicTo(*dash.outline, &left.ctrl1, &left.ctrl2, &left.end, transform);
+                }
+            } else {
+                //nothing left of the current entry: the whole curve goes to the next one
+                right = cur;
+            }
+            //advance to the next pattern entry (dash <-> gap)
+            dash.curIdx = (dash.curIdx + 1) % dash.cnt;
+            dash.curLen = dash.pattern[dash.curIdx];
+            dash.curOpGap = !dash.curOpGap;
+            cur = right;
+            dash.ptCur = right.start;
+            dash.move = true;
+        }
+        //leftovers
+        dash.curLen -= len;
+        if (!dash.curOpGap) {
+            if (dash.move) {
+                _outlineMoveTo(*dash.outline, &cur.start, transform);
+                dash.move = false;
+            }
+            _outlineCubicTo(*dash.outline, &cur.ctrl1, &cur.ctrl2, &cur.end, transform);
+        }
+        //NOTE(review): the line variant uses 1 as the threshold here - confirm the difference is intended
+        if (dash.curLen < 0.1f && TO_SWCOORD(len) > 1) {
+            //move to next dash
+            dash.curIdx = (dash.curIdx + 1) % dash.cnt;
+            dash.curLen = dash.pattern[dash.curIdx];
+            dash.curOpGap = !dash.curOpGap;
+        }
+    }
+    dash.ptCur = *to;
+}
+
+
+/* Closes the current dashed sub-path by dashing a straight line back to its start point. */
+static void _dashClose(SwDashStroke& dash, const Matrix& transform)
+{
+    const Point* subpathStart = &dash.ptStart;
+    _dashLineTo(dash, subpathStart, transform);
+}
+
+
+/* Begins a new sub-path at *pts while keeping the current position inside the
+   dash pattern (index, remaining length and gap state are left untouched). */
+static void _dashMoveTo(SwDashStroke& dash, const Point* pts)
+{
+    const Point& origin = *pts;
+    dash.move = true;
+    dash.ptStart = origin;
+    dash.ptCur = origin;
+}
+
+
+/* Begins a new sub-path at *pts and rewinds the dash pattern to its initial phase.
+   offIdx: index of the pattern entry the dash offset falls into (odd means a gap)
+   offset: part of that entry which is already consumed by the dash offset */
+static void _dashMoveTo(SwDashStroke& dash, uint32_t offIdx, float offset, const Point* pts)
+{
+    //pattern phase
+    dash.curIdx = offIdx % dash.cnt;
+    dash.curLen = dash.pattern[dash.curIdx] - offset;
+    dash.curOpGap = offIdx % 2;
+
+    //sub-path origin
+    dash.ptCur = *pts;
+    dash.ptStart = dash.ptCur;
+    dash.move = true;
+}
+
+
+/* Expresses a trim range as a dash pattern written to dash->pattern / dash->cnt.
+   length:   length of the path (or sub-path) being trimmed
+   trimBegin/trimEnd: trim range as fractions of 'length'
+   The pattern alternates dash/gap entries, starting with a dash.
+   rshape is not used. */
+static void _trimPattern(SwDashStroke* dash, const RenderShape* rshape, float length, float trimBegin, float trimEnd)
+{
+    const auto begin = length * trimBegin;
+    const auto end = length * trimEnd;
+    auto pattern = dash->pattern;
+
+    if (!(end > begin)) {
+        //looping: visible [0, end], hidden [end, begin], visible [begin, length]
+        dash->cnt = 3;
+        pattern[0] = end;
+        pattern[1] = (begin - end);
+        pattern[2] = length - begin;
+    } else if (begin > 0.0f) {
+        //hidden [0, begin], visible [begin, end], hidden [end, length]
+        dash->cnt = 4;
+        pattern[0] = 0;     //zero dash to start with a space.
+        pattern[1] = begin;
+        pattern[2] = end - begin;
+        pattern[3] = length - end;
+    } else {
+        //visible part followed by the hidden rest
+        dash->cnt = 2;
+        pattern[0] = end - begin;
+        pattern[1] = length - (end - begin);
+    }
+}
+
+
+/* Measures the length of the shape path, starting at the given point/command offsets.
+   shiftPts/shiftCmds: number of points/commands to skip from the beginning of the path
+   subpath: when true, measuring stops at the end of the first sub-path
+            (its Close command or the next MoveTo)
+   Returns 0 if there are no commands or points left after the offsets. */
+static float _outlineLength(const RenderShape* rshape, uint32_t shiftPts, uint32_t shiftCmds, bool subpath)
+{
+    const PathCommand* cmd = rshape->path.cmds.data + shiftCmds;
+    const Point* pt = rshape->path.pts.data + shiftPts;
+    auto cmdsLeft = rshape->path.cmds.count - shiftCmds;
+    auto ptsLeft = rshape->path.pts.count - shiftPts;
+
+    //No actual shape data
+    if (cmdsLeft <= 0 || ptsLeft <= 0) return 0.0f;
+
+    const Point* first = nullptr;   //start point of the current sub-path
+    auto total = 0.0f;
+
+    //must begin with moveTo
+    if (*cmd == PathCommand::MoveTo) {
+        first = pt++;
+        ++cmd;
+        --cmdsLeft;
+    }
+
+    for (; cmdsLeft > 0; --cmdsLeft, ++cmd) {
+        if (*cmd == PathCommand::Close) {
+            //closing segment: from the last point back to the sub-path start
+            total += length(pt - 1, first);
+            if (subpath) return total;
+        } else if (*cmd == PathCommand::MoveTo) {
+            if (subpath) return total;
+            first = pt++;
+        } else if (*cmd == PathCommand::LineTo) {
+            total += length(pt - 1, pt);
+            ++pt;
+        } else if (*cmd == PathCommand::CubicTo) {
+            total += Bezier{*(pt - 1), *pt, *(pt + 1), *(pt + 2)}.length();
+            pt += 3;
+        }
+    }
+    return total;
+}
+
+
+/* Builds the dashed (or trimmed) version of the shape path, which is stroked afterwards.
+   trimmed: true if stroke trimming is requested. Trimming is expressed as a generated
+            dash pattern and is currently ignored if the stroke has its own dash pattern.
+   Returns the outline requested from the memory pool via mpoolReqDashOutline(), or
+   nullptr if the path is empty, there is neither a dash pattern nor trimming, or the
+   temporary trim pattern can't be allocated. */
+static SwOutline* _genDashOutline(const RenderShape* rshape, const Matrix& transform, bool trimmed, SwMpool* mpool, unsigned tid)
+{
+    const PathCommand* cmds = rshape->path.cmds.data;
+    auto cmdCnt = rshape->path.cmds.count;
+    const Point* pts = rshape->path.pts.data;
+    auto ptsCnt = rshape->path.pts.count;
+
+    //No actual shape data
+    if (cmdCnt == 0 || ptsCnt == 0) return nullptr;
+
+    auto startPts = pts;
+    auto startCmds = cmds;
+
+    SwDashStroke dash;
+    auto offset = 0.0f;
+    dash.cnt = rshape->strokeDash((const float**)&dash.pattern, &offset);
+    auto simultaneous = rshape->stroke->trim.simultaneous;
+    float trimBegin = 0.0f, trimEnd = 1.0f;
+    if (trimmed) rshape->stroke->strokeTrim(trimBegin, trimEnd);
+
+    if (dash.cnt == 0) {
+        if (!trimmed) return nullptr;
+        //no dash pattern: allocate room for the trim pattern (4 entries at most, see _trimPattern())
+        dash.pattern = static_cast<float*>(malloc(sizeof(float) * 4));
+        //allocation failed, nothing has been requested from the memory pool yet
+        if (!dash.pattern) return nullptr;
+    } else {
+        //TODO: handle dash + trim - for now trimming ignoring is forced
+        trimmed = false;
+    }
+
+    //offset
+    auto patternLength = 0.0f;
+    uint32_t offIdx = 0;
+    if (!tvg::zero(offset)) {
+        for (size_t i = 0; i < dash.cnt; ++i) patternLength += dash.pattern[i];
+        //an odd pattern only repeats after two passes (dash and gap roles swap)
+        bool isOdd = dash.cnt % 2;
+        if (isOdd) patternLength *= 2;
+
+        if (tvg::zero(patternLength)) {
+            /* Nothing to offset into (e.g. trimming without any dash entry).
+               fmodf() by zero would yield NaN and corrupt the dash state. */
+            offset = 0.0f;
+        } else {
+            //wrap the offset into [0, patternLength)
+            offset = fmodf(offset, patternLength);
+            if (offset < 0) offset += patternLength;
+
+            //find the pattern entry the offset falls into
+            for (size_t i = 0; i < dash.cnt * (1 + (size_t)isOdd); ++i, ++offIdx) {
+                auto curPattern = dash.pattern[i % dash.cnt];
+                if (offset < curPattern) break;
+                offset -= curPattern;
+            }
+        }
+    }
+
+    dash.outline = mpoolReqDashOutline(mpool, tid);
+
+    //must begin with moveTo
+    if (cmds[0] == PathCommand::MoveTo) {
+        if (trimmed) _trimPattern(&dash, rshape, _outlineLength(rshape, 0, 0, simultaneous), trimBegin, trimEnd);
+        _dashMoveTo(dash, offIdx, offset, pts);
+        cmds++;
+        pts++;
+    }
+
+    //NOTE(review): the pre-decrement accounts for the leading moveTo consumed above;
+    //if the path doesn't begin with a moveTo the last command is not processed - confirm.
+    while (--cmdCnt > 0) {
+        switch (*cmds) {
+            case PathCommand::Close: {
+                _dashClose(dash, transform);
+                break;
+            }
+            case PathCommand::MoveTo: {
+                if (trimmed) {
+                    if (simultaneous) {
+                        //every sub-path is trimmed on its own: rebuild the pattern from its length
+                        _trimPattern(&dash, rshape, _outlineLength(rshape, pts - startPts, cmds - startCmds, true), trimBegin, trimEnd);
+                        _dashMoveTo(dash, offIdx, offset, pts);
+                    } else _dashMoveTo(dash, pts);
+                } else _dashMoveTo(dash, offIdx, offset, pts);
+                ++pts;
+                break;
+            }
+            case PathCommand::LineTo: {
+                _dashLineTo(dash, pts, transform);
+                ++pts;
+                break;
+            }
+            case PathCommand::CubicTo: {
+                _dashCubicTo(dash, pts, pts + 1, pts + 2, transform);
+                pts += 3;
+                break;
+            }
+        }
+        ++cmds;
+    }
+
+    _outlineEnd(*dash.outline);
+
+    //the pattern is owned by this function only if it was allocated for trimming
+    if (trimmed) free(dash.pattern);
+
+    return dash.outline;
+}
+
+
+/* Fast track test: tells whether the outline is an axis-aligned rectangle.
+   It must consist of exactly 5 points, the third one must not be a cubic control
+   point, and the 2nd and 4th points must be the two remaining corners of the box
+   spanned by the 1st and 3rd points. */
+static bool _axisAlignedRect(const SwOutline* outline)
+{
+    if (outline->pts.count != 5 || outline->types[2] == SW_CURVE_TYPE_CUBIC) return false;
+
+    auto corner = outline->pts.data;
+
+    //the corners complementing the diagonal corner[0] - corner[2]
+    auto a = SwPoint{corner[0].x, corner[2].y};
+    auto b = SwPoint{corner[2].x, corner[0].y};
+
+    return (corner[1] == a && corner[3] == b) || (corner[1] == b && corner[3] == a);
+}
+
+
+/* Converts the shape path into a transformed outline stored in shape->outline
+   (requested from the memory pool) and updates shape->fastTrack.
+   hasComposite: when true the rectangle fast track is never enabled.
+   Returns false if the path has no commands or no points. */
+static bool _genOutline(SwShape* shape, const RenderShape* rshape, const Matrix& transform, SwMpool* mpool, unsigned tid, bool hasComposite)
+{
+    auto& path = rshape->path;
+
+    //No actual shape data
+    if (path.cmds.count == 0 || path.pts.count == 0) return false;
+
+    shape->outline = mpoolReqOutline(mpool, tid);
+    SwOutline& outline = *shape->outline;
+
+    const Point* pt = path.pts.data;
+    const PathCommand* cmd = path.cmds.data;
+    auto closed = false;    //whether the current contour has been closed already
+
+    //Generate Outlines
+    for (auto left = path.cmds.count; left > 0; --left, ++cmd) {
+        switch (*cmd) {
+            case PathCommand::MoveTo: {
+                closed = _outlineMoveTo(outline, pt, transform, closed);
+                pt += 1;
+                break;
+            }
+            case PathCommand::LineTo: {
+                //drawing continues after a close: open a new contour first
+                if (closed) closed = _outlineBegin(outline);
+                _outlineLineTo(outline, pt, transform);
+                pt += 1;
+                break;
+            }
+            case PathCommand::CubicTo: {
+                //drawing continues after a close: open a new contour first
+                if (closed) closed = _outlineBegin(outline);
+                _outlineCubicTo(outline, pt, pt + 1, pt + 2, transform);
+                pt += 3;
+                break;
+            }
+            case PathCommand::Close: {
+                if (!closed) closed = _outlineClose(outline);
+                break;
+            }
+        }
+    }
+
+    //finalize the last contour if it was left open
+    if (!closed) _outlineEnd(outline);
+
+    outline.fillRule = rshape->rule;
+
+    shape->fastTrack = (!hasComposite && _axisAlignedRect(shape->outline));
+    return true;
+}
+
+
+/************************************************************************/
+/* External Class Implementation                                        */
+/************************************************************************/
+
+/* Generates the shape outline and computes its render region.
+   renderRegion is written by mathUpdateOutlineBBox() and copied to shape->bbox.
+   Returns false if there is no outline, the bounding box can't be updated, the
+   region is smaller than 1 in both dimensions, or it doesn't overlap clipRegion. */
+bool shapePrepare(SwShape* shape, const RenderShape* rshape, const Matrix& transform,  const SwBBox& clipRegion, SwBBox& renderRegion, SwMpool* mpool, unsigned tid, bool hasComposite)
+{
+    auto ready = _genOutline(shape, rshape, transform, mpool, tid, hasComposite) &&
+                 mathUpdateOutlineBBox(shape->outline, clipRegion, renderRegion, shape->fastTrack);
+    if (!ready) return false;
+
+    shape->bbox = renderRegion;
+
+    //Check valid region
+    auto w = renderRegion.max.x - renderRegion.min.x;
+    auto h = renderRegion.max.y - renderRegion.min.y;
+    if (w < 1 && h < 1) return false;
+
+    //Check boundary
+    auto outsideX = (renderRegion.min.x >= clipRegion.max.x) || (renderRegion.max.x <= clipRegion.min.x);
+    auto outsideY = (renderRegion.min.y >= clipRegion.max.y) || (renderRegion.max.y <= clipRegion.min.y);
+
+    return !(outsideX || outsideY);
+}
+
+
+/* Tells whether the shape has RLE data. */
+bool shapePrepared(const SwShape* shape)
+{
+    if (shape->rle) return true;
+    return false;
+}
+
+
+/* Generates the RLE data of the shape fill from its outline.
+   Fast-track (axis-aligned rectangle) shapes need no RLE, so they succeed right away.
+   Returns false if rleRender() yields no RLE data. rshape is not used. */
+bool shapeGenRle(SwShape* shape, TVG_UNUSED const RenderShape* rshape, bool antiAlias)
+{
+    //FIXME: Should we draw it?
+    //Case: Stroke Line
+    //if (shape.outline->opened) return true;
+
+    //Case A: Fast Track Rectangle Drawing
+    if (shape->fastTrack) return true;
+
+    //Case B: Normal Shape RLE Drawing
+    shape->rle = rleRender(shape->rle, shape->outline, shape->bbox, antiAlias);
+    if (shape->rle) return true;
+
+    return false;
+}
+
+
+/* Drops the shape's outline reference and returns the outline of the given
+   thread slot to the memory pool. */
+void shapeDelOutline(SwShape* shape, SwMpool* mpool, uint32_t tid)
+{
+    shape->outline = nullptr;
+    mpoolRetOutline(mpool, tid);
+}
+
+
+/* Resets the fill state of the shape: bounding box, fast-track flag and RLE data. */
+void shapeReset(SwShape* shape)
+{
+    shape->bbox.reset();
+    shape->fastTrack = false;
+    rleReset(shape->rle);
+}
+
+
+/* Releases everything the shape owns: fill RLE, fill data and, if a stroke
+   exists, the stroke RLE and the stroke itself. */
+void shapeFree(SwShape* shape)
+{
+    //fill
+    rleFree(shape->rle);
+    shape->rle = nullptr;
+    shapeDelFill(shape);
+
+    //stroke
+    if (!shape->stroke) return;
+
+    rleFree(shape->strokeRle);
+    shape->strokeRle = nullptr;
+    strokeFree(shape->stroke);
+    shape->stroke = nullptr;
+}
+
+
+/* Releases the stroke RLE and the stroke of the shape. Does nothing without a stroke. */
+void shapeDelStroke(SwShape* shape)
+{
+    if (shape->stroke) {
+        rleFree(shape->strokeRle);
+        shape->strokeRle = nullptr;
+
+        strokeFree(shape->stroke);
+        shape->stroke = nullptr;
+    }
+}
+
+
+/* Prepares the stroke for a new generation pass: allocates it (zero-initialized)
+   on first use, then resets the stroke state and the stroke RLE.
+   Returns silently if the allocation fails. */
+void shapeResetStroke(SwShape* shape, const RenderShape* rshape, const Matrix& transform)
+{
+    if (!shape->stroke) {
+        shape->stroke = static_cast<SwStroke*>(calloc(1, sizeof(SwStroke)));
+        if (!shape->stroke) return;
+    }
+
+    strokeReset(shape->stroke, rshape, transform);
+    rleReset(shape->strokeRle);
+}
+
+
+/* Generates the RLE data of the shape stroke.
+   The source outline is either a dash outline (dashed and/or trimmed stroke) or the
+   regular shape outline, which is generated here if the shape has none yet.
+   The dash outline and the stroke outline are returned to the memory pool before leaving.
+   Returns false if no source outline is available, the stroke can't be parsed or
+   the bounding box can't be updated. */
+bool shapeGenStrokeRle(SwShape* shape, const RenderShape* rshape, const Matrix& transform, const SwBBox& clipRegion, SwBBox& renderRegion, SwMpool* mpool, unsigned tid)
+{
+    SwOutline* shapeOutline = nullptr;
+    auto trimmed = rshape->strokeTrim();
+    auto dashStroking = (rshape->stroke->dashCnt > 0 || trimmed);
+
+    if (dashStroking) {
+        //Dash style (+trimming)
+        shapeOutline = _genDashOutline(rshape, transform, trimmed, mpool, tid);
+        if (!shapeOutline) return false;
+    } else {
+        //Normal style
+        if (!shape->outline && !_genOutline(shape, rshape, transform, mpool, tid, false)) return false;
+        shapeOutline = shape->outline;
+    }
+
+    auto success = false;
+
+    if (strokeParseOutline(shape->stroke, *shapeOutline)) {
+        auto strokeOutline = strokeExportOutline(shape->stroke, mpool, tid);
+        if (mathUpdateOutlineBBox(strokeOutline, clipRegion, renderRegion, false)) {
+            shape->strokeRle = rleRender(shape->strokeRle, strokeOutline, renderRegion, true);
+            success = true;
+        }
+    }
+
+    //give the temporary outlines back to the memory pool
+    if (dashStroking) mpoolRetDashOutline(mpool, tid);
+    mpoolRetStrokeOutline(mpool, tid);
+
+    return success;
+}
+
+
+/* Generates the color table of the shape fill; forwards to fillGenColorTable()
+   with shape->fill and returns its result. */
+bool shapeGenFillColors(SwShape* shape, const Fill* fill, const Matrix& transform, SwSurface* surface, uint8_t opacity, bool ctable)
+{
+    auto generated = fillGenColorTable(shape->fill, fill, transform, surface, opacity, ctable);
+    return generated;
+}
+
+
+/* Generates the color table of the stroke fill; forwards to fillGenColorTable()
+   with shape->stroke->fill and returns its result.
+   shape->stroke is dereferenced without a check, so it must exist. */
+bool shapeGenStrokeFillColors(SwShape* shape, const Fill* fill, const Matrix& transform, SwSurface* surface, uint8_t opacity, bool ctable)
+{
+    auto generated = fillGenColorTable(shape->stroke->fill, fill, transform, surface, opacity, ctable);
+    return generated;
+}
+
+
+/* Resets the shape fill, allocating it (zero-initialized) on first use.
+   Does nothing if the allocation fails. */
+void shapeResetFill(SwShape* shape)
+{
+    if (!shape->fill) shape->fill = static_cast<SwFill*>(calloc(1, sizeof(SwFill)));
+    if (shape->fill) fillReset(shape->fill);
+}
+
+
+/* Resets the stroke fill, allocating it (zero-initialized) on first use.
+   Does nothing if the allocation fails.
+   shape->stroke is dereferenced without a check, so it must exist. */
+void shapeResetStrokeFill(SwShape* shape)
+{
+    auto stroke = shape->stroke;
+
+    if (!stroke->fill) stroke->fill = static_cast<SwFill*>(calloc(1, sizeof(SwFill)));
+    if (stroke->fill) fillReset(stroke->fill);
+}
+
+
+/* Releases the shape fill, if there is one. */
+void shapeDelFill(SwShape* shape)
+{
+    if (shape->fill) {
+        fillFree(shape->fill);
+        shape->fill = nullptr;
+    }
+}
+
+
+/* Releases the stroke fill, if there is one.
+   shape->stroke is dereferenced without a check, so it must exist. */
+void shapeDelStrokeFill(SwShape* shape)
+{
+    auto stroke = shape->stroke;
+
+    if (stroke->fill) {
+        fillFree(stroke->fill);
+        stroke->fill = nullptr;
+    }
+}
--- a/thirdparty/thorvg/src/renderer/sw_engine/tvgSwStroke.cpp
+++ b/thirdparty/thorvg/src/renderer/sw_engine/tvgSwStroke.cpp
@@ -0,0 +1,913 @@
+/*
+ * Copyright (c) 2020 - 2024 the ThorVG project. All rights reserved.
+
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <string.h>
+#include <math.h>
+#include "tvgSwCommon.h"
+
+/************************************************************************/
+/* Internal Class Implementation                                        */
+/************************************************************************/
+
+//Border point tags. They are bit flags: BEGIN/END are or-ed onto the tag of a point.
+static constexpr int SW_STROKE_TAG_POINT = 1 << 0;    //on-curve point
+static constexpr int SW_STROKE_TAG_CUBIC = 1 << 1;    //cubic control point
+static constexpr int SW_STROKE_TAG_BEGIN = 1 << 2;    //first point of a sub-path
+static constexpr int SW_STROKE_TAG_END = 1 << 3;      //last point of a sub-path
+
+/* Maps a border side to the rotation leading from the stroke direction to that
+   border: SW_ANGLE_PI2 for side 0, -SW_ANGLE_PI2 for side 1. */
+static inline SwFixed SIDE_TO_ROTATE(const int32_t s)
+{
+    const SwFixed side = s;
+    return SW_ANGLE_PI2 - side * SW_ANGLE_PI;
+}
+
+
+/* Scales the point in place by the stroke scale factors (sx, sy). */
+static inline void SCALE(const SwStroke& stroke, SwPoint& pt)
+{
+    const auto scaledX = pt.x * stroke.sx;
+    const auto scaledY = pt.y * stroke.sy;
+
+    pt.x = static_cast<SwCoord>(scaledX);
+    pt.y = static_cast<SwCoord>(scaledY);
+}
+
+
+/* Makes sure the border can hold 'newPts' more points.
+   The capacity grows by half of its size plus 16 until the request fits, then the
+   point and tag buffers are reallocated.
+   NOTE(review): the realloc() results are not checked. */
+static void _growBorder(SwStrokeBorder* border, uint32_t newPts)
+{
+    auto required = border->ptsCnt + newPts;
+    auto capacity = border->maxPts;
+
+    //enough room already
+    if (required <= capacity) return;
+
+    do {
+        capacity += (capacity >> 1) + 16;
+    } while (capacity < required);
+
+    //OPTIMIZE: use mempool!
+    border->pts = static_cast<SwPoint*>(realloc(border->pts, capacity * sizeof(SwPoint)));
+    border->tags = static_cast<uint8_t*>(realloc(border->tags, capacity * sizeof(uint8_t)));
+    border->maxPts = capacity;
+}
+
+
+/* Finishes the current sub-path of the border.
+   A sub-path with at most one point is discarded. Otherwise the last point replaces
+   the first one, the remaining points (and their tags) are optionally reversed and
+   the first/last points get the BEGIN/END tags.
+   Afterwards the border has no open sub-path (start = -1). */
+static void _borderClose(SwStrokeBorder* border, bool reverse)
+{
+    auto start = border->start;
+    auto count = border->ptsCnt;
+
+    if (count > start + 1U) {
+        /* Copy the last point to the start of this sub-path,
+           since it contains the adjusted starting coordinates */
+        --count;
+        border->ptsCnt = count;
+        border->pts[start] = border->pts[count];
+
+        if (reverse) {
+            //reverse the points
+            for (auto head = border->pts + start + 1, tail = border->pts + count - 1; head < tail; ++head, --tail) {
+                auto saved = *head;
+                *head = *tail;
+                *tail = saved;
+            }
+
+            //reverse the tags
+            for (auto head = border->tags + start + 1, tail = border->tags + count - 1; head < tail; ++head, --tail) {
+                auto saved = *head;
+                *head = *tail;
+                *tail = saved;
+            }
+        }
+
+        border->tags[start] |= SW_STROKE_TAG_BEGIN;
+        border->tags[count - 1] |= SW_STROKE_TAG_END;
+    } else {
+        //Don't record empty paths!
+        border->ptsCnt = start;
+    }
+
+    border->start = -1;
+    border->movable = false;
+}
+
+
+/* Appends a cubic segment (two control points and the end point) to the border.
+   The last point is not movable afterwards. */
+static void _borderCubicTo(SwStrokeBorder* border, const SwPoint& ctrl1, const SwPoint& ctrl2, const SwPoint& to)
+{
+    _growBorder(border, 3);
+
+    //the buffers may have been reallocated, so address them only now
+    auto idx = border->ptsCnt;
+
+    border->pts[idx] = ctrl1;
+    border->pts[idx + 1] = ctrl2;
+    border->pts[idx + 2] = to;
+
+    border->tags[idx] = SW_STROKE_TAG_CUBIC;
+    border->tags[idx + 1] = SW_STROKE_TAG_CUBIC;
+    border->tags[idx + 2] = SW_STROKE_TAG_POINT;
+
+    border->ptsCnt += 3;
+    border->movable = false;
+}
+
+
+/* Appends an arc to the border, approximated by cubic bezier segments.
+   center:     center of the arc
+   radius:     radius of the arc, before the stroke scale (sx, sy) is applied
+   angleStart: angle of the arc start
+   angleDiff:  signed angular extent of the arc
+   Each cubic segment covers at most ARC_CUBIC_ANGLE (half of SW_ANGLE_PI). */
+static void _borderArcTo(SwStrokeBorder* border, const SwPoint& center, SwFixed radius, SwFixed angleStart, SwFixed angleDiff, SwStroke& stroke)
+{
+    constexpr SwFixed ARC_CUBIC_ANGLE = SW_ANGLE_PI / 2;
+    //start point of the arc
+    SwPoint a = {static_cast<SwCoord>(radius), 0};
+    mathRotate(a, angleStart);
+    SCALE(stroke, a);
+    a += center;
+
+    auto total = angleDiff;
+    auto angle = angleStart;
+    //a quarter turn in the direction of the arc, used to orient the control point arms
+    auto rotate = (angleDiff >= 0) ? SW_ANGLE_PI2 : -SW_ANGLE_PI2;
+
+    while (total != 0) {
+        //clamp the step to one cubic segment
+        auto step = total;
+        if (step > ARC_CUBIC_ANGLE) step = ARC_CUBIC_ANGLE;
+        else if (step < -ARC_CUBIC_ANGLE) step = -ARC_CUBIC_ANGLE;
+
+        auto next = angle + step;
+        auto theta = step;
+        if (theta < 0) theta = -theta;
+
+        //half of the (absolute) step angle
+        theta >>= 1;
+
+        //compute end point
+        SwPoint b = {static_cast<SwCoord>(radius), 0};
+        mathRotate(b, next);
+        SCALE(stroke, b);
+        b += center;
+
+        //compute first and second control points
+        //arm length = radius * 4 * sin(theta) / (3 * (1 + cos(theta))), 0x10000L presumably being 1.0 in 16.16 fixed point
+        auto length = mathMulDiv(radius, mathSin(theta) * 4, (0x10000L + mathCos(theta)) * 3);
+
+        SwPoint a2 = {static_cast<SwCoord>(length), 0};
+        mathRotate(a2, angle + rotate);
+        SCALE(stroke, a2);
+        a2 += a;
+
+        SwPoint b2 = {static_cast<SwCoord>(length), 0};
+        mathRotate(b2, next - rotate);
+        SCALE(stroke, b2);
+        b2 += b;
+
+        //add cubic arc
+        _borderCubicTo(border, a2, b2, b);
+
+        //process the rest of the arc?
+        a = b;
+        total -= step;
+        angle = next;
+    }
+}
+
+
+/* Appends a line end point to the border.
+   If the last point of the border is movable it is replaced instead of adding a
+   new point. A point too close to the previous one is dropped, in which case the
+   movable state is left untouched.
+   movable: whether the point may be replaced by the next line end point */
+static void _borderLineTo(SwStrokeBorder* border, const SwPoint& to, bool movable)
+{
+    if (!border->movable) {
+        auto cnt = border->ptsCnt;
+
+        //don't add zero-length line_to
+        if (cnt > 0 && (border->pts[cnt - 1] - to).small()) return;
+
+        _growBorder(border, 1);
+        border->pts[cnt] = to;
+        border->tags[cnt] = SW_STROKE_TAG_POINT;
+        border->ptsCnt = cnt + 1;
+    } else {
+        //move last point
+        border->pts[border->ptsCnt - 1] = to;
+    }
+
+    border->movable = movable;
+}
+
+
+/* Starts a new sub-path of the border at 'to', closing (without reversing) the
+   sub-path that is still open, if any. */
+static void _borderMoveTo(SwStrokeBorder* border, SwPoint& to)
+{
+    auto hasOpenPath = (border->start >= 0);
+    if (hasOpenPath) _borderClose(border, false);
+
+    //the new sub-path begins at the next free slot
+    border->movable = false;
+    border->start = border->ptsCnt;
+
+    _borderLineTo(border, to, false);
+}
+
+
+/* Adds an arc around stroke.center with the radius stroke.width to the border of
+   the given side, sweeping from the incoming to the outgoing stroke direction. */
+static void _arcTo(SwStroke& stroke, int32_t side)
+{
+    auto rotate = SIDE_TO_ROTATE(side);
+    auto sweep = mathDiff(stroke.angleIn, stroke.angleOut);
+
+    //a half turn is ambiguous: pick the sweep direction from the side
+    if (sweep == SW_ANGLE_PI) sweep = -rotate * 2;
+
+    auto border = &stroke.borders[side];
+    _borderArcTo(border, stroke.center, stroke.width, stroke.angleIn + rotate, sweep, stroke);
+    border->movable = false;
+}
+
+
+/* Processes the outer side of a corner at stroke.center, according to stroke.join.
+   side:       index of the border which is on the outside of the turn
+   lineLength: length of the outgoing line (zero for curves)
+   A round join adds an arc. A miter join adds the miter tip unless the miter limit
+   is exceeded, in which case it falls back to a bevel. */
+static void _outside(SwStroke& stroke, int32_t side, SwFixed lineLength)
+{
+    auto border = stroke.borders + side;
+
+    if (stroke.join == StrokeJoin::Round) {
+        _arcTo(stroke, side);
+    } else {
+        //this is a mitered (pointed) or beveled (truncated) corner
+        auto rotate = SIDE_TO_ROTATE(side);
+        auto bevel = stroke.join == StrokeJoin::Bevel;
+        SwFixed phi = 0;        //direction of the miter tip, seen from the center
+        SwFixed thcos = 0;      //cosine of half of the turn angle
+
+        if (!bevel) {
+            auto theta = mathDiff(stroke.angleIn, stroke.angleOut);
+            if (theta == SW_ANGLE_PI) {
+                theta = rotate;
+                phi = stroke.angleIn;
+            } else {
+                theta /= 2;
+                phi = stroke.angleIn + theta + rotate;
+            }
+
+            thcos = mathCos(theta);
+            auto sigma = mathMultiply(stroke.miterlimit, thcos);
+
+            //is miter limit exceeded?
+            //0x10000L is presumably 1.0 in 16.16 fixed point
+            if (sigma < 0x10000L) bevel = true;
+        }
+
+        //this is a bevel (broken angle)
+        if (bevel) {
+            SwPoint delta = {static_cast<SwCoord>(stroke.width), 0};
+            mathRotate(delta, stroke.angleOut + rotate);
+            SCALE(stroke, delta);
+            delta += stroke.center;
+            //force a new point instead of replacing the last one
+            border->movable = false;
+            _borderLineTo(border, delta, false);
+        //this is a miter (intersection)
+        } else {
+            auto length = mathDivide(stroke.width, thcos);
+            SwPoint delta = {static_cast<SwCoord>(length), 0};
+            mathRotate(delta, phi);
+            SCALE(stroke, delta);
+            delta += stroke.center;
+            _borderLineTo(border, delta, false);
+
+            /* Now add and end point
+               Only needed if not lineto (lineLength is zero for curves) */
+            if (lineLength == 0) {
+                delta = {static_cast<SwCoord>(stroke.width), 0};
+                mathRotate(delta, stroke.angleOut + rotate);
+                SCALE(stroke, delta);
+                delta += stroke.center;
+                _borderLineTo(border, delta, false);
+            }
+        }
+    }
+}
+
+
+/* Processes the inner side of a corner at stroke.center.
+   side:       index of the border which is on the inside of the turn
+   lineLength: length of the outgoing line (zero for curves)
+   If both adjacent lines are long enough the two border lines are joined at their
+   intersection, otherwise the start point of the outgoing border line is added. */
+static void _inside(SwStroke& stroke, int32_t side, SwFixed lineLength)
+{
+    auto border = stroke.borders + side;
+    //half of the turn angle
+    auto theta = mathDiff(stroke.angleIn, stroke.angleOut) / 2;
+    SwPoint delta;
+    bool intersect = false;
+
+    /* Only intersect borders if between two line_to's and both
+       lines are long enough (line length is zero for curves). */
+    if (border->movable && lineLength > 0) {
+        //compute minimum required length of lines
+        //NOTE(review): relies on an abs() overload for the 64-bit SwFixed being visible here - confirm
+        SwFixed minLength = abs(mathMultiply(stroke.width, mathTan(theta)));
+        if (stroke.lineLength >= minLength && lineLength >= minLength) intersect = true;
+    }
+
+    auto rotate = SIDE_TO_ROTATE(side);
+
+    if (!intersect) {
+        //offset the center perpendicular to the outgoing direction
+        delta = {static_cast<SwCoord>(stroke.width), 0};
+        mathRotate(delta, stroke.angleOut + rotate);
+        SCALE(stroke, delta);
+        delta += stroke.center;
+        //force a new point instead of replacing the last one
+        border->movable = false;
+    } else {
+        //compute median angle
+        auto phi = stroke.angleIn + theta;
+        auto thcos = mathCos(theta);
+        delta = {static_cast<SwCoord>(mathDivide(stroke.width, thcos)), 0};
+        mathRotate(delta, phi + rotate);
+        SCALE(stroke, delta);
+        delta += stroke.center;
+    }
+
+    _borderLineTo(border, delta, false);
+}
+
+
+/* Emits the join between the incoming direction (stroke.angleIn) and the outgoing
+   direction (stroke.angleOut) on both borders. 'lineLength' is forwarded to the
+   inside/outside handlers. */
+void _processCorner(SwStroke& stroke, SwFixed lineLength)
+{
+    auto turn = mathDiff(stroke.angleIn, stroke.angleOut);
+
+    //going straight on: there is no corner to build
+    if (turn == 0) return;
+
+    //a negative turn puts the inside on border 1, otherwise (right turn) on border 0
+    const int32_t inner = (turn < 0) ? 1 : 0;
+    const int32_t outer = 1 - inner;
+
+    _inside(stroke, inner, lineLength);
+    _outside(stroke, outer, lineLength);
+}
+
+
+/* Starts both borders of a sub-path: each one receives a move_to at stroke.center
+   displaced by the stroke half-width, perpendicular to 'startAngle' (opposite
+   directions for the two borders). */
+void _firstSubPath(SwStroke& stroke, SwFixed startAngle, SwFixed lineLength)
+{
+    SwPoint offset = {static_cast<SwCoord>(stroke.width), 0};
+    mathRotate(offset, startAngle + SW_ANGLE_PI2);
+    SCALE(stroke, offset);
+
+    auto begin0 = stroke.center + offset;
+    _borderMoveTo(stroke.borders + 0, begin0);
+
+    auto begin1 = stroke.center - offset;
+    _borderMoveTo(stroke.borders + 1, begin1);
+
+    /* Remember the start angle and first segment length (zero for curves);
+       they are needed again when the sub-path is ended. */
+    stroke.subPathAngle = startAngle;
+    stroke.subPathLineLength = lineLength;
+    stroke.firstPt = false;
+}
+
+
+/* Strokes a straight segment from stroke.center to 'to': handles the corner (or the
+   sub-path start) at the current position, then extends both borders to the end
+   of the segment. */
+static void _lineTo(SwStroke& stroke, const SwPoint& to)
+{
+    auto dir = to - stroke.center;
+
+    //nothing to draw for a zero-length segment...
+    if (dir.zero()) {
+        //...except that round and square caps must still show up as a dot
+        if (stroke.firstPt && stroke.cap != StrokeCap::Butt) _firstSubPath(stroke, 0, 0);
+        return;
+    }
+
+    /* lineLength decides whether stroke outlines may intersect at a corner.
+       The stroke width is not scaled, so the scale is removed from the
+       direction vector before measuring it. (Scaling the width by the sx/sy
+       mix along the stroke direction would be the alternative.) */
+    dir.x = static_cast<SwCoord>(dir.x / stroke.sx);
+    dir.y = static_cast<SwCoord>(dir.y / stroke.sy);
+    auto lineLength = mathLength(dir);
+    auto angle = mathAtan(dir);
+
+    //offset from the center line to the first border, perpendicular to the segment
+    SwPoint offset = {static_cast<SwCoord>(stroke.width), 0};
+    mathRotate(offset, angle + SW_ANGLE_PI2);
+    SCALE(stroke, offset);
+
+    if (stroke.firstPt) {
+        //first segment of the sub-path: open both borders at their start points
+        _firstSubPath(stroke, angle, lineLength);
+    } else {
+        //join this segment to the previous one
+        stroke.angleOut = angle;
+        _processCorner(stroke, lineLength);
+    }
+
+    //extend border 0 by +offset and border 1 by -offset; lineto ends are movable
+    for (int32_t side = 0; side < 2; ++side) {
+        auto pt = to + offset;
+        _borderLineTo(stroke.borders + side, pt, true);
+
+        offset.x = -offset.x;
+        offset.y = -offset.y;
+    }
+
+    stroke.angleIn = angle;
+    stroke.center = to;
+    stroke.lineLength = lineLength;
+}
+
+
+/* Strokes a cubic Bezier from stroke.center to 'to' with control points 'ctrl1' and
+   'ctrl2', appending offset curves to both borders.
+   The curve is kept on a small stack of 4-point segments. Each iteration asks
+   mathCubicAngle() about the top segment and then either splits it, drops the
+   curve as ignorable, or emits the segment to the borders and pops it. */
+static void _cubicTo(SwStroke& stroke, const SwPoint& ctrl1, const SwPoint& ctrl2, const SwPoint& to)
+{
+    SwPoint bezStack[37];   //TODO: static?
+    //a segment is only split while 'arc' is below this bound
+    //(presumably so that the points written by mathSplitCubic() stay inside bezStack - TODO confirm)
+    auto limit = bezStack + 32;
+    auto arc = bezStack;
+    auto firstArc = true;
+    //segments are stored back to front: arc[0] is the end point, arc[3] the start point
+    arc[0] = to;
+    arc[1] = ctrl2;
+    arc[2] = ctrl1;
+    arc[3] = stroke.center;
+
+    //runs until the last segment has been popped (arc -= 3 below moves it before bezStack)
+    while (arc >= bezStack) {
+        SwFixed angleIn, angleOut, angleMid;
+
+        //initialize with current direction
+        angleIn = angleOut = angleMid = stroke.angleIn;
+
+        //presumably: > 0 the segment still has to be subdivided, < 0 it is degenerate,
+        //otherwise it can be offset directly - TODO confirm against mathCubicAngle()
+        auto valid = mathCubicAngle(arc, angleIn, angleMid, angleOut);
+
+        //valid size
+        if (valid > 0 && arc < limit) {
+            //nothing emitted yet in this sub-path: take the reported angle as current direction
+            if (stroke.firstPt) stroke.angleIn = angleIn;
+            mathSplitCubic(arc);
+            //continue with the segment three points further up the stack
+            arc += 3;
+            continue;
+        }
+
+        //ignoreable size
+        if (valid < 0 && arc == bezStack) {
+            stroke.center = to;
+
+            //round and square caps are expected to be drawn as a dot even for zero-length lines
+            if (stroke.firstPt && stroke.cap != StrokeCap::Butt) _firstSubPath(stroke, 0, 0);
+            return;
+        }
+
+        //small size
+        if (firstArc) {
+            firstArc = false;
+            //process corner if necessary
+            if (stroke.firstPt) {
+                //a line length of zero marks the segment as a curve
+                _firstSubPath(stroke, angleIn, 0);
+            } else {
+                stroke.angleOut = angleIn;
+                _processCorner(stroke, 0);
+            }
+        //threshold is SW_ANGLE_PI / 32, i.e. 5.625 degrees
+        } else if (abs(mathDiff(stroke.angleIn, angleIn)) > (SW_ANGLE_PI / 8) / 4) {
+            //if the deviation from one arc to the next is too great add a round corner
+            stroke.center = arc[3];
+            stroke.angleOut = angleIn;
+            //temporarily force a round join for this corner only
+            stroke.join = StrokeJoin::Round;
+
+            _processCorner(stroke, 0);
+
+            //reinstate line join style
+            stroke.join = stroke.joinSaved;
+        }
+
+        //the arc's angle is small enough; we can add it directly to each border
+        auto theta1 = mathDiff(angleIn, angleMid) / 2;
+        auto theta2 = mathDiff(angleMid, angleOut) / 2;
+        auto phi1 = mathMean(angleIn, angleMid);
+        auto phi2 = mathMean(angleMid, angleOut);
+        //distances of the offset control points from the original control points
+        auto length1 = mathDivide(stroke.width, mathCos(theta1));
+        auto length2 = mathDivide(stroke.width, mathCos(theta2));
+        SwFixed alpha0 = 0;
+
+        //compute direction of original arc
+        if (stroke.handleWideStrokes) {
+            alpha0 = mathAtan(arc[0] - arc[3]);
+        }
+
+        auto border = stroke.borders;
+        int32_t side = 0;
+
+        //emit the offset curve once per border (side 0, then side 1)
+        while (side < 2) {
+            auto rotate = SIDE_TO_ROTATE(side);
+
+            //compute control points
+            SwPoint _ctrl1 = {static_cast<SwCoord>(length1), 0};
+            mathRotate(_ctrl1, phi1 + rotate);
+            SCALE(stroke, _ctrl1);
+            _ctrl1 += arc[2];
+
+            SwPoint _ctrl2 = {static_cast<SwCoord>(length2), 0};
+            mathRotate(_ctrl2, phi2 + rotate);
+            SCALE(stroke, _ctrl2);
+            _ctrl2 += arc[1];
+
+            //compute end point
+            SwPoint _end = {static_cast<SwCoord>(stroke.width), 0};
+            mathRotate(_end, angleOut + rotate);
+            SCALE(stroke, _end);
+            _end += arc[0];
+
+            if (stroke.handleWideStrokes) {
+                /* determine whether the border radius is greater than the radius of
+                   curvature of the original arc */
+                //_start is the last point currently stored in this border
+                auto _start = border->pts[border->ptsCnt - 1];
+                auto alpha1 = mathAtan(_end - _start);
+
+                //is the direction of the border arc opposite to that of the original arc?
+                if (abs(mathDiff(alpha0, alpha1)) > SW_ANGLE_PI / 2) {
+
+                    //use the sine rule to find the intersection point
+                    auto beta = mathAtan(arc[3] - _start);
+                    auto gamma = mathAtan(arc[0] - _end);
+                    auto bvec = _end - _start;
+                    auto blen = mathLength(bvec);
+                    auto sinA = abs(mathSin(alpha1 - gamma));
+                    auto sinB = abs(mathSin(beta - gamma));
+                    auto alen = mathMulDiv(blen, sinA, sinB);
+
+                    //intersection point: 'alen' away from _start in direction 'beta'
+                    SwPoint delta = {static_cast<SwCoord>(alen), 0};
+                    mathRotate(delta, beta);
+                    delta += _start;
+
+                    //circumnavigate the negative sector backwards
+                    border->movable = false;
+                    _borderLineTo(border, delta, false);
+                    _borderLineTo(border, _end, false);
+                    //the curve is traversed in reverse here: control points swapped, ending at _start
+                    _borderCubicTo(border, _ctrl2, _ctrl1, _start);
+
+                    //and then move to the endpoint
+                    _borderLineTo(border, _end, false);
+
+                    ++side;
+                    ++border;
+                    continue;
+                }
+            }
+            //regular case: plain offset curve
+            _borderCubicTo(border, _ctrl1, _ctrl2, _end);
+            ++side;
+            ++border;
+        }
+        //pop the emitted segment; its outgoing angle becomes the current direction
+        arc -= 3;
+        stroke.angleIn = angleOut;
+    }
+    stroke.center = to;
+}
+
+
+/* Appends the cap geometry at stroke.center to the border selected by 'side'.
+   'angle' is the direction the cap points to. */
+static void _addCap(SwStroke& stroke, SwFixed angle, int32_t side)
+{
+    //a round cap is a half turn handled by the arc helper
+    if (stroke.cap == StrokeCap::Round) {
+        stroke.angleIn = angle;
+        stroke.angleOut = angle + SW_ANGLE_PI;
+        _arcTo(stroke, side);
+        return;
+    }
+
+    auto rotate = SIDE_TO_ROTATE(side);
+    auto border = stroke.borders + side;
+
+    //vector of stroke.width (half the stroke width) length pointing towards 'dir', scaled by sx/sy
+    auto offsetAt = [&stroke](SwFixed dir) {
+        SwPoint v = {static_cast<SwCoord>(stroke.width), 0};
+        mathRotate(v, dir);
+        SCALE(stroke, v);
+        return v;
+    };
+
+    if (stroke.cap == StrokeCap::Square) {
+        //first corner: pushed forward along 'angle' and sideways by 'rotate'
+        auto corner = offsetAt(angle);
+        corner += stroke.center + offsetAt(angle + rotate);
+        _borderLineTo(border, corner, false);
+
+        //second corner: pushed forward and sideways in the opposite direction
+        corner = offsetAt(angle);
+        corner += offsetAt(angle - rotate) + stroke.center;
+        _borderLineTo(border, corner, false);
+    } else {  //Butt
+        //straight across the end: one point on each side of the center
+        auto edge = offsetAt(angle + rotate);
+        edge += stroke.center;
+        _borderLineTo(border, edge, false);
+
+        edge = offsetAt(angle - rotate);
+        edge += stroke.center;
+        _borderLineTo(border, edge, false);
+    }
+}
+
+
+/* Moves the points of the current left sub-path (borders[1]) to the end of the right
+   border (borders[0]) in reverse order, then rewinds the left border to the start of
+   that sub-path. For opened paths the begin/end tags are stripped from the copies;
+   otherwise a lone begin or end tag is flipped into the other one. */
+static void _addReverseLeft(SwStroke& stroke, bool opened)
+{
+    auto right = stroke.borders + 0;
+    auto left = stroke.borders + 1;
+
+    //number of points in the left border since its sub-path start
+    auto newPts = left->ptsCnt - left->start;
+    if (newPts <= 0) return;
+
+    _growBorder(right, newPts);
+
+    const auto edgeTags = (SW_STROKE_TAG_BEGIN | SW_STROKE_TAG_END);
+
+    auto dstPt = right->pts + right->ptsCnt;
+    auto dstTag = right->tags + right->ptsCnt;
+    auto srcPt = left->pts + left->ptsCnt - 1;
+    auto srcTag = left->tags + left->ptsCnt - 1;
+
+    //walk the left border backwards while filling the right border forwards
+    for (; srcPt >= left->pts + left->start; --srcPt, --srcTag, ++dstPt, ++dstTag) {
+        *dstPt = *srcPt;
+        *dstTag = *srcTag;
+
+        if (opened) {
+            *dstTag &= ~edgeTags;
+            continue;
+        }
+
+        //reversing the direction turns a begin into an end and vice versa
+        auto marks = *dstTag & edgeTags;
+        if (marks == SW_STROKE_TAG_BEGIN || marks == SW_STROKE_TAG_END) *dstTag ^= edgeTags;
+    }
+
+    left->ptsCnt = left->start;
+    right->ptsCnt += newPts;
+    right->movable = false;
+    left->movable = false;
+}
+
+
+/* Prepares the stroker state for a new sub-path starting at 'to'.
+   The first point itself is not emitted here since its corner/cap is not known yet;
+   it is handled once the sub-path is ended (see _endSubPath()). */
+static void _beginSubPath(SwStroke& stroke, const SwPoint& to, bool closed)
+{
+    stroke.firstPt = true;
+    stroke.center = to;
+    stroke.ptStartSubPath = to;
+    stroke.angleIn = 0;
+    stroke.closedSubPath = closed;
+
+    /* Curves need the special wide-stroke handling (border radius greater than the
+       radius of curvature) only if bevel joins or butt caps may be created; round &
+       miter joins and round & square caps cover the negative sector that wide
+       strokes create. */
+    const bool nonRoundJoin = (stroke.join != StrokeJoin::Round);
+    const bool openButtCap = (!closed && stroke.cap == StrokeCap::Butt);
+    stroke.handleWideStrokes = (nonRoundJoin || openButtCap);
+}
+
+
+/* Finishes the current sub-path: an opened path gets two caps and is folded into a
+   single contour on the right border; a closed path gets its final join and both
+   borders are closed. */
+static void _endSubPath(SwStroke& stroke)
+{
+    if (!stroke.closedSubPath) {
+        /* Opened path: cap at the end, append the reversed left border to the
+           right one, then cap at the start of the sub-path. */
+        _addCap(stroke, stroke.angleIn, 0);
+        _addReverseLeft(stroke, true);
+
+        stroke.center = stroke.ptStartSubPath;
+        _addCap(stroke, stroke.subPathAngle + SW_ANGLE_PI, 0);
+
+        //only the right border holds data now; the left one has been rewound
+        _borderClose(stroke.borders + 0, false);
+        return;
+    }
+
+    //go back to the start point if the path did not end there
+    if (stroke.center != stroke.ptStartSubPath) _lineTo(stroke, stroke.ptStartSubPath);
+
+    //join the last segment with the first one (nothing is added when the turn is 0)
+    stroke.angleOut = stroke.subPathAngle;
+    _processCorner(stroke, stroke.subPathLineLength);
+
+    _borderClose(stroke.borders + 0, false);
+    _borderClose(stroke.borders + 1, true);
+}
+
+
+/* Counts the points and contours stored in 'border'.
+   The tags must form well-nested contours: every contour starts with a BEGIN tag,
+   no BEGIN appears inside an open contour and the last contour is terminated by END.
+   If they do not, both outputs are set to 0. */
+static void _getCounts(SwStrokeBorder* border, uint32_t& ptsCnt, uint32_t& cntrsCnt)
+{
+    uint32_t points = 0;
+    uint32_t contours = 0;
+    auto opened = false;    //true while inside a contour
+    auto valid = true;
+    auto tag = border->tags;
+
+    for (auto remaining = border->ptsCnt; remaining > 0; --remaining, ++tag, ++points) {
+        //a BEGIN inside a contour or a non-BEGIN outside of one is malformed
+        const bool begins = (*tag & SW_STROKE_TAG_BEGIN) != 0;
+        if (begins == opened) {
+            valid = false;
+            break;
+        }
+        opened = true;
+
+        if (*tag & SW_STROKE_TAG_END) {
+            opened = false;
+            ++contours;
+        }
+    }
+
+    //a contour that never ended is malformed as well
+    if (opened) valid = false;
+
+    ptsCnt = valid ? points : 0;
+    cntrsCnt = valid ? contours : 0;
+}
+
+
+/* Appends the points of the border selected by 'side' to 'outline', translating the
+   stroke tags into outline curve types and recording the index of every contour end.
+   The outline arrays are written in place past their current count, so the caller
+   has to reserve enough room beforehand. */
+static void _exportBorderOutline(const SwStroke& stroke, SwOutline* outline, uint32_t side)
+{
+    auto border = stroke.borders + side;
+    auto total = border->ptsCnt;
+    if (total == 0) return;
+
+    auto base = outline->pts.count;
+    memcpy(outline->pts.data + base, border->pts, total * sizeof(SwPoint));
+
+    auto dst = outline->types.data + outline->types.count;
+
+    for (decltype(total) i = 0; i < total; ++i) {
+        auto tag = border->tags[i];
+
+        if (tag & SW_STROKE_TAG_POINT) dst[i] = SW_CURVE_TYPE_POINT;
+        else if (tag & SW_STROKE_TAG_CUBIC) dst[i] = SW_CURVE_TYPE_CUBIC;
+        else TVGERR("SW_ENGINE", "Invalid stroke tag was given! = %d", tag);
+
+        //contours are stored as the outline index of their last point
+        if (tag & SW_STROKE_TAG_END) outline->cntrs.push(base + i);
+    }
+
+    outline->pts.count += total;
+    outline->types.count += total;
+}
+
+
+/************************************************************************/
+/* External Class Implementation                                        */
+/************************************************************************/
+
+/* Releases a stroke object: the point/tag arrays of both borders, its fill and the
+   object itself. Passing nullptr is allowed and does nothing. */
+void strokeFree(SwStroke* stroke)
+{
+    if (!stroke) return;
+
+    //free borders (free() accepts null pointers, no guard needed)
+    for (int32_t side = 0; side < 2; ++side) {
+        free(stroke->borders[side].pts);
+        free(stroke->borders[side].tags);
+    }
+
+    fillFree(stroke->fill);
+    stroke->fill = nullptr;
+
+    free(stroke);
+}
+
+
+/* Re-initializes 'stroke' from the stroke properties of 'rshape' and the scale of
+   'transform', and empties both borders. */
+void strokeReset(SwStroke* stroke, const RenderShape* rshape, const Matrix& transform)
+{
+    //empty both borders; -1 marks that no sub-path has been started
+    for (int32_t side = 0; side < 2; ++side) {
+        auto& border = stroke->borders[side];
+        border.ptsCnt = 0;
+        border.start = -1;
+    }
+
+    //scale factors: lengths of the (e11, e21) and (e12, e22) matrix columns
+    stroke->sx = sqrtf(powf(transform.e11, 2.0f) + powf(transform.e21, 2.0f));
+    stroke->sy = sqrtf(powf(transform.e12, 2.0f) + powf(transform.e22, 2.0f));
+
+    stroke->width = HALF_STROKE(rshape->strokeWidth());
+    stroke->cap = rshape->strokeCap();
+    stroke->miterlimit = static_cast<SwFixed>(rshape->strokeMiterlimit() * 65536.0f);
+
+    //the join is kept twice: 'join' may be changed temporarily while stroking curves
+    auto join = rshape->strokeJoin();
+    stroke->join = join;
+    stroke->joinSaved = join;
+}
+
+
+/* Feeds every contour of 'outline' through the stroker, filling the borders of 'stroke'.
+   Returns false if a contour starts with a cubic control point, true otherwise. */
+bool strokeParseOutline(SwStroke* stroke, const SwOutline& outline)
+{
+    uint32_t first = 0;     //index of the first point of the current contour
+    uint32_t i = 0;         //index of the current contour
+
+    for (auto cntr = outline.cntrs.begin(); cntr < outline.cntrs.end(); ++cntr, ++i) {
+        auto last = *cntr;  //index of the last point of the current contour
+
+        //contours without at least two points are skipped
+        if (last > first) {
+            auto pt = outline.pts.data + first;
+            auto limit = outline.pts.data + last;
+            auto types = outline.types.data + first;
+
+            //a contour cannot start with a cubic control point
+            if (types[0] == SW_CURVE_TYPE_CUBIC) return false;
+            ++types;
+
+            auto start = outline.pts[first];
+            auto closed = outline.closed.data ? outline.closed.data[i] : false;
+
+            _beginSubPath(*stroke, start, closed);
+
+            while (pt < limit) {
+                //a single line_to
+                if (types[0] == SW_CURVE_TYPE_POINT) {
+                    ++pt;
+                    ++types;
+                    _lineTo(*stroke, *pt);
+                    continue;
+                }
+
+                //a cubic: two control points followed by the end point
+                pt += 3;
+                types += 3;
+                if (pt <= limit) _cubicTo(*stroke, pt[-2], pt[-1], pt[0]);
+                //the end point is missing: the curve ends at the contour start
+                else if (pt - 1 == limit) _cubicTo(*stroke, pt[-2], pt[-1], start);
+                else break;
+            }
+
+            //close the sub-path unless nothing has been emitted for it
+            if (!stroke->firstPt) _endSubPath(*stroke);
+        }
+
+        first = last + 1;
+    }
+    return true;
+}
+
+
+/* Builds the outline of the stroke from both borders.
+   The outline is requested from 'mpool' for thread 'tid' and its arrays are reserved
+   from the validated point/contour counts of the two borders.
+   A border whose tags fail validation in _getCounts() reports zero points; such a
+   border is not exported, because no room has been reserved for it and copying
+   its points could write past the reserved outline arrays. */
+SwOutline* strokeExportOutline(SwStroke* stroke, SwMpool* mpool, unsigned tid)
+{
+    uint32_t ptsCnt0, cntrsCnt0, ptsCnt1, cntrsCnt1;
+
+    _getCounts(stroke->borders + 0, ptsCnt0, cntrsCnt0);
+    _getCounts(stroke->borders + 1, ptsCnt1, cntrsCnt1);
+
+    auto ptsCnt = ptsCnt0 + ptsCnt1;
+    auto cntrsCnt = cntrsCnt0 + cntrsCnt1;
+
+    auto outline = mpoolReqStrokeOutline(mpool, tid);
+    outline->pts.reserve(ptsCnt);
+    outline->types.reserve(ptsCnt);
+    outline->cntrs.reserve(cntrsCnt);
+
+    //export only what has been counted (and therefore reserved) above;
+    //borders[0] is the one _addReverseLeft() refers to as 'right', borders[1] as 'left'
+    if (ptsCnt0 > 0) _exportBorderOutline(*stroke, outline, 0);  //right
+    if (ptsCnt1 > 0) _exportBorderOutline(*stroke, outline, 1);  //left
+
+    return outline;
+}