STRATO-apps/wordpress_03/app/wp-content/plugins/aimogen-pro/res/tokenizer/Gpt3Tokenizer.php

SHA-256: e07dd5163c2dde603b2d7972d00fe318f9965c3cfc07b41f3f81f18bc4809632
<?php

namespace Gioni06\Gpt3Tokenizer;

// Stop immediately unless the ABSPATH constant is defined, so this file does nothing when it is
// executed on its own (ABSPATH is presumably defined by the host application's bootstrap - confirm).
defined('ABSPATH') or die();
/**
 * Byte-level Byte-Pair-Encoding (BPE) tokenizer in the style of GPT-2 / GPT-3.
 *
 * Text is split into pieces with PAT_REGEX, every UTF-8 byte of a piece is mapped to a
 * printable stand-in character (see bytes_to_unicode()), the stand-in string is merged
 * according to the ranked merge table, and the resulting symbols are looked up in the
 * vocabulary to obtain token ids.
 */
class Gpt3Tokenizer
{
    /**
     * Pre-tokenisation pattern: English contractions, runs of letters, runs of numbers,
     * runs of other non-space characters, and whitespace.
     *
     * \pL / \pN are used instead of [[:alpha:]] / [[:digit:]]: with the "u" modifier PCRE
     * treats [[:digit:]] as \p{Nd}, so other numeric characters (e.g. superscripts and
     * fractions) matched no alternative at all and were silently dropped from the input.
     */
    const PAT_REGEX = "/'s|'t|'re|'ve|'m|'ll|'d| ?\pL+| ?\pN+| ?[^[:space:]\pL\pN]+|\s+(?!\S)|\s+/u";

    /** Prefix for APCu keys so cached merges cannot collide with unrelated APCu entries. */
    private const CACHE_PREFIX = 'gpt3tokenizer:bpe:';

    /** @var mixed Symbol => token id lookup, as returned by Vocab::data(). */
    private mixed $vocab;

    /** @var array<string|int, int> "left,right" merge key => rank (lower rank merges first). */
    private array $bpe_ranks;

    /** @var bool True when the APCu extension is loaded and enabled. */
    private bool $apcuAvailable;

    /** @var array<string|int, string> Per-instance BPE result cache used when APCu is unavailable. */
    private array $cache = [];

    /** @var bool Whether bpe() results are cached at all. */
    private bool $useCache;

    /** @var array|null Token id => symbol lookup, built lazily on the first decode() call. */
    private ?array $decoder = null;

    /**
     * Loads the vocabulary and the merge table named by the configuration.
     *
     * @param Gpt3TokenizerConfig $config Must provide 'vocabPath', 'mergesPath' and 'useCache'.
     */
    public function __construct(Gpt3TokenizerConfig $config)
    {
        $settings = $config->getConfig();

        $vocab = new Vocab($settings['vocabPath']);
        $this->vocab = $vocab->data();
        // Free memory that is no longer needed
        unset($vocab);

        $merges = new Merges($settings['mergesPath']);
        // zipBpe() returns a list, so flipping it yields "left,right" => position, i.e. the rank.
        $this->bpe_ranks = array_flip(self::zipBpe($merges->bpeMerges()));
        // Free memory that is no longer needed
        unset($merges);

        $this->apcuAvailable = function_exists('apcu_enabled') && apcu_enabled();
        $this->useCache = $settings['useCache'];
    }

    /**
     * Stores a BPE result in APCu when available, otherwise in the per-instance cache.
     */
    private function cacheSet(string $key, string $val): void
    {
        if ($this->apcuAvailable) {
            /** @noinspection PhpComposerExtensionStubsInspection */
            apcu_store(self::CACHE_PREFIX . $key, $val);
            return;
        }
        $this->cache[$key] = $val;
    }

    /**
     * Returns the cached BPE result for $key, or null on a miss.
     *
     * A single fetch is used (instead of exists + fetch) so an entry that disappears between
     * the two calls cannot leak a non-string value to the caller.
     */
    private function cacheGet(string $key): ?string
    {
        if ($this->apcuAvailable) {
            $found = false;
            /** @noinspection PhpComposerExtensionStubsInspection */
            $value = apcu_fetch(self::CACHE_PREFIX . $key, $found);
            return ($found && is_string($value)) ? $value : null;
        }
        return $this->cache[$key] ?? null;
    }

    /**
     * Returns the byte => stand-in character table used by GPT-2 style byte-level BPE.
     *
     * Bytes 33-126, 161-172 and 174-255 map to the code point of the same value; all other
     * bytes are assigned, in ascending order, the code points from U+0100 upwards. The table
     * is generated from code points (rather than typed as literals) so it cannot be corrupted
     * by editors or tooling that normalise characters, and it is built only once per request.
     *
     * @return array<int, string> 256 entries keyed 0..255, each a single UTF-8 character.
     */
    public static function bytes_to_unicode(): array
    {
        static $table = null;

        if ($table === null) {
            $table = [];
            $shifted = 0;
            for ($byte = 0; $byte < 256; $byte++) {
                $keepsOwnCodePoint = ($byte >= 33 && $byte <= 126)
                    || ($byte >= 161 && $byte <= 172)
                    || $byte >= 174;
                $codePoint = $keepsOwnCodePoint ? $byte : 256 + $shifted++;
                $table[$byte] = mb_chr($codePoint, 'UTF-8');
            }
        }

        return $table;
    }

    /**
     * Converts a string into the list of its UTF-8 byte values.
     *
     * The input is treated as UTF-8 regardless of mb_internal_encoding(); invalid sequences
     * are replaced by the mbstring substitute character. An empty string yields an empty list.
     *
     * @return int[] Byte values in the range 0..255.
     */
    public static function encodeStr(string $str): array
    {
        if ($str === '') {
            return [];
        }

        $utf8 = (string) mb_convert_encoding($str, 'UTF-8', 'UTF-8');
        $bytes = unpack('C*', $utf8);

        // unpack() returns a 1-based array; callers expect a plain list.
        return $bytes === false ? [] : array_values($bytes);
    }

    /**
     * Builds a binary string from a list of byte values (inverse of encodeStr()).
     *
     * @param int[] $codes Byte values.
     */
    public static function decodeStr(array $codes): string
    {
        return implode('', array_map(static fn ($code) => chr($code), $codes));
    }

    /**
     * Returns the distinct adjacent pairs of a symbol list.
     *
     * Pairs are compared strictly, so numeric-looking symbols such as '0' and '00' are kept
     * apart. Each pair keeps the index of its first occurrence as array key, which means the
     * returned array may have gaps in its keys.
     *
     * @param array $input_arr List of symbols (0-based, consecutive keys).
     * @return array<int, array{0: mixed, 1: mixed}>
     */
    public static function get_pairs($input_arr): array
    {
        $pairs = [];
        $seen = [];
        $lastIndex = count($input_arr) - 1;

        for ($i = 0; $i < $lastIndex; $i++) {
            $pair = [$input_arr[$i], $input_arr[$i + 1]];
            $signature = serialize($pair);
            if (!isset($seen[$signature])) {
                $seen[$signature] = true;
                $pairs[$i] = $pair;
            }
        }

        return $pairs;
    }

    /**
     * Turns a list of [left, right] merges into a list of "left,right" keys (same order).
     *
     * @param array $bpeMerges Each element must provide indexes 0 and 1.
     * @return string[]
     */
    public static function zipBpe(array $bpeMerges): array
    {
        $keys = [];
        foreach ($bpeMerges as $merge) {
            $keys[] = $merge[0] . ',' . $merge[1];
        }
        return $keys;
    }

    /**
     * Applies the ranked merges to one byte-mapped piece.
     *
     * Repeatedly merges the adjacent pair with the lowest rank (all of its non-overlapping
     * occurrences, left to right) until no adjacent pair is in the merge table or a single
     * symbol remains.
     *
     * @param string $token Piece already mapped through bytes_to_unicode().
     * @return string The resulting symbols joined by single spaces.
     */
    public function bpe(string $token): string
    {
        if ($this->useCache) {
            $cached = $this->cacheGet($token);
            if ($cached !== null) {
                return $cached;
            }
        }

        $symbols = mb_str_split($token, 1, 'UTF-8');
        if (count($symbols) < 2) {
            // Nothing to merge; as before, such trivial results are not cached.
            return implode(' ', $symbols);
        }

        while (count($symbols) > 1) {
            $bestRank = null;
            $first = null;
            $second = null;
            $lastIndex = count($symbols) - 1;

            // Find the adjacent pair with the lowest rank.
            for ($i = 0; $i < $lastIndex; $i++) {
                $rank = $this->bpe_ranks[$symbols[$i] . ',' . $symbols[$i + 1]] ?? null;
                if ($rank !== null && ($bestRank === null || $rank <= $bestRank)) {
                    $bestRank = $rank;
                    $first = $symbols[$i];
                    $second = $symbols[$i + 1];
                }
            }

            if ($bestRank === null) {
                // No remaining pair is mergeable.
                break;
            }

            // Merge every non-overlapping occurrence of the chosen pair in one linear pass.
            $merged = [];
            $total = count($symbols);
            for ($i = 0; $i < $total;) {
                if ($i < $total - 1 && $symbols[$i] === $first && $symbols[$i + 1] === $second) {
                    $merged[] = $first . $second;
                    $i += 2;
                } else {
                    $merged[] = $symbols[$i];
                    $i++;
                }
            }
            $symbols = $merged;
        }

        $result = implode(' ', $symbols);
        if ($this->useCache) {
            $this->cacheSet($token, $result);
        }
        return $result;
    }

    /**
     * Splits text into pre-tokenisation pieces using PAT_REGEX.
     *
     * If matching fails (for example because the text is not valid UTF-8), invalid sequences
     * are replaced via mb_scrub() and matching is retried once; an empty list is returned
     * only if that fails as well.
     *
     * @return string[]
     */
    private static function splitText(string $text): array
    {
        $matches = [];
        if (preg_match_all(self::PAT_REGEX, $text, $matches) === false) {
            $matches = [];
            if (preg_match_all(self::PAT_REGEX, mb_scrub($text, 'UTF-8'), $matches) === false) {
                return [];
            }
        }
        return $matches[0] ?? [];
    }

    /**
     * Encodes one pre-tokenisation piece into token ids.
     *
     * @param string             $piece       A piece produced by splitText().
     * @param array<int, string> $byteEncoder Table from bytes_to_unicode().
     * @return array Token ids as stored in the vocabulary.
     */
    private function encodePiece(string $piece, array $byteEncoder): array
    {
        $mapped = '';
        foreach (self::encodeStr($piece) as $byte) {
            $mapped .= $byteEncoder[$byte];
        }

        $ids = [];
        foreach (explode(' ', $this->bpe($mapped)) as $symbol) {
            $ids[] = $this->vocab[$symbol];
        }
        return $ids;
    }

    /**
     * Encodes text into a flat list of token ids.
     *
     * @param string $text The input text (expected to be UTF-8).
     * @return array Token ids.
     */
    public function encode(string $text): array
    {
        $byteEncoder = self::bytes_to_unicode();
        $ids = [];

        foreach (self::splitText($text) as $piece) {
            foreach ($this->encodePiece($piece, $byteEncoder) as $id) {
                $ids[] = $id;
            }
        }

        return $ids;
    }

    /**
     * Encodes a given text into chunks of Byte-Pair Encoded (BPE) tokens, with each chunk containing a specified
     * maximum number of tokens.
     *
     * The tokens of one pre-tokenisation piece are never split across chunks, so a piece that
     * alone produces more tokens than the limit ends up in an oversized chunk of its own.
     * Empty chunks are never emitted.
     *
     * @param string $text The input text to be encoded.
     * @param int $maxTokenPerChunk The maximum number of tokens allowed per chunk.
     * @return int[][] An array of arrays containing BPE token chunks.
     */
    public function encodeInChunks(string $text, int $maxTokenPerChunk): array
    {
        $byteEncoder = self::bytes_to_unicode();
        $chunks = [];
        $current = [];

        foreach (self::splitText($text) as $piece) {
            $ids = $this->encodePiece($piece, $byteEncoder);

            // Close the running chunk only if it holds something; otherwise an empty chunk
            // would be emitted whenever a single piece exceeds the limit.
            if ($current !== [] && count($current) + count($ids) > $maxTokenPerChunk) {
                $chunks[] = $current;
                $current = [];
            }

            foreach ($ids as $id) {
                $current[] = $id;
            }
        }

        if ($current !== []) {
            $chunks[] = $current;
        }

        return $chunks;
    }

    /**
     * Takes a given text and chunks it into encoded segments, with each segment containing a specified maximum
     * number of tokens.
     * @param string $text The input text to be encoded.
     * @param int $maxTokenPerChunk The maximum number of tokens allowed per chunk.
     * @return string[] An array of strings, one decoded text segment per chunk.
     */
    public function chunk(string $text, int $maxTokenPerChunk): array
    {
        return array_map(
            fn (array $ids): string => $this->decode($ids),
            $this->encodeInChunks($text, $maxTokenPerChunk)
        );
    }

    /**
     * Converts token ids back into text.
     *
     * Ids that are not in the vocabulary, and symbols that are not in the byte table, are
     * skipped. The id => symbol lookup is built once per instance and reused.
     *
     * @param array $tokens Token ids.
     * @return string The decoded bytes (UTF-8 when the ids came from encode()).
     */
    public function decode(array $tokens): string
    {
        $this->decoder ??= array_flip($this->vocab);
        $byteDecoder = array_flip(self::bytes_to_unicode());

        $mapped = '';
        foreach ($tokens as $id) {
            $mapped .= $this->decoder[$id] ?? '';
        }

        $codes = [];
        foreach (mb_str_split($mapped, 1, 'UTF-8') as $char) {
            if (isset($byteDecoder[$char])) {
                $codes[] = $byteDecoder[$char];
            }
        }

        return self::decodeStr($codes);
    }

    /**
     * Returns the number of tokens the text encodes to.
     */
    public function count(string $text): int
    {
        return count($this->encode($text));
    }
}