generate_lang_file CLI command: add -F/--fix-utf8 parameter

Add an option that attempts to load and repair existing translation files containing broken UTF-8 characters.
2024-11-18 00:09:06 +01:00 · 2021-10-08 17:20:31 +02:00 · 2021-10-08 17:20:31 +02:00 · 27f06b7b66
commit 27f06b7b66
parent c0d874c68d
2 changed files with 372 additions and 2 deletions
--- a/src/includes/class/class.LSlang.php
+++ b/src/includes/class/class.LSlang.php
@ -260,6 +260,7 @@ function cli_generate_lang_file($command_args) {
  $output = False;
  $additionalfileformat = False;
  $keep_unused = False;
+  $fix_utf8 = False;
  $lang = null;
  $encoding = null;
  $available_formats = array('php', 'pot');
@ -370,6 +371,11 @@ function cli_generate_lang_file($command_args) {
        $keep_unused = true;
        break;

+      case '--fix-utf8':
+      case '-F':
+        $fix_utf8 = true;
+        break;
+
      default:
        $path = relative2absolute_path($command_args[$i]);
        if (is_file($path))
@ -489,12 +495,19 @@ function cli_generate_lang_file($command_args) {
    }
  }

+  // In fix-utf8 mode, load the ForceUTF8/Encoding lib
+  if ($fix_utf8)
+    LSsession :: includeFile(LS_LIB_DIR . "ForceUTF8/Encoding.php");
+
  // Load translation files
  foreach($load_files as $path) {
    $LSlang_cli_logger -> debug("Load $path lang file");
    @include($path);
    foreach($GLOBALS['LSlang'] as $msg => $trans) {
-      $translations[$msg]=$trans;
+      if ($fix_utf8)
+        $translations[\ForceUTF8\Encoding::fixUTF8($msg)] = \ForceUTF8\Encoding::fixUTF8($trans);
+      else
+        $translations[$msg] = $trans;
    }
  }

@ -504,6 +517,9 @@ function cli_generate_lang_file($command_args) {
  // Load lang string if lang was specify
  if ($lang && $encoding && isset($GLOBALS['LSlang']) && is_array($GLOBALS['LSlang'])) {
    foreach($GLOBALS['LSlang'] as $msg => $trans) {
+      if ($fix_utf8)
+        $translations[\ForceUTF8\Encoding::fixUTF8($msg)] = \ForceUTF8\Encoding::fixUTF8($trans);
+      else
        $translations[$msg] = $trans;
    }
  }
@ -980,6 +996,7 @@ function cli_generate_lang_file_args_autocompleter($comp_words, $comp_word_num,
      '-f', '--format',
      '-I', '--include-upstream',
      '-K', '--keep-unused',
+      '-F', '--fix-utf8',
    )
  );
  return LScli :: autocomplete_opts($opts, $comp_word);
@ -1009,6 +1026,8 @@ LScli :: add_command(
    "  -f/--format                 Output file format : php or pot",
    "                              (default: php)",
    "  -K/--keep-unused            Keep unused translations in resulting file",
+    "  -F/--fix-utf8               Try to load and fix broken UTF-8 characters in",
+    "                              existing lang files."
  ),
  false,  // This command does not need LDAP connection
  'cli_generate_lang_file_args_autocompleter'
--- a/src/includes/libs/ForceUTF8/Encoding.php
+++ b/src/includes/libs/ForceUTF8/Encoding.php
@ -0,0 +1,351 @@
+<?php
+/*
+Copyright (c) 2008 Sebastián Grignoli
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+1. Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+3. Neither the name of copyright holders nor the names of its
+   contributors may be used to endorse or promote products derived
+   from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL COPYRIGHT HOLDERS OR CONTRIBUTORS
+BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+*/
+
+/**
+ * @author   "Sebastián Grignoli" <grignoli@gmail.com>
+ * @package  Encoding
+ * @version  2.0
+ * @link     https://github.com/neitanod/forceutf8
+ * @example  https://github.com/neitanod/forceutf8
+ * @license  Revised BSD
+  */
+
+namespace ForceUTF8;
+
+/**
+ * Best-effort repair and conversion of text between UTF-8 and the
+ * single-byte Windows-1252 / ISO-8859-1 encodings.
+ *
+ * All methods are static and work on raw bytes. The class assumes that any
+ * non-UTF-8 input is Windows-1252 or ISO-8859-1; characters that cannot be
+ * represented in those code pages are replaced by "?" when decoding.
+ */
+class Encoding {
+
+  const ICONV_TRANSLIT = "TRANSLIT";
+  const ICONV_IGNORE = "IGNORE";
+  const WITHOUT_ICONV = "";
+
+  /**
+   * Windows-1252 byte value (0x80-0x9F) => UTF-8 sequence of the character it
+   * represents. Bytes 129, 141, 143, 144 and 157 have no entry here.
+   */
+  protected static $win1252ToUtf8 = array(
+        128 => "\xe2\x82\xac",
+
+        130 => "\xe2\x80\x9a",
+        131 => "\xc6\x92",
+        132 => "\xe2\x80\x9e",
+        133 => "\xe2\x80\xa6",
+        134 => "\xe2\x80\xa0",
+        135 => "\xe2\x80\xa1",
+        136 => "\xcb\x86",
+        137 => "\xe2\x80\xb0",
+        138 => "\xc5\xa0",
+        139 => "\xe2\x80\xb9",
+        140 => "\xc5\x92",
+
+        142 => "\xc5\xbd",
+
+
+        145 => "\xe2\x80\x98",
+        146 => "\xe2\x80\x99",
+        147 => "\xe2\x80\x9c",
+        148 => "\xe2\x80\x9d",
+        149 => "\xe2\x80\xa2",
+        150 => "\xe2\x80\x93",
+        151 => "\xe2\x80\x94",
+        152 => "\xcb\x9c",
+        153 => "\xe2\x84\xa2",
+        154 => "\xc5\xa1",
+        155 => "\xe2\x80\xba",
+        156 => "\xc5\x93",
+
+        158 => "\xc5\xbe",
+        159 => "\xc5\xb8"
+  );
+
+  /**
+   * UTF-8 encoding of a code point U+0080-U+009F (what a Windows-1252 byte
+   * becomes when it is converted as if it were ISO-8859-1) => UTF-8 sequence
+   * of the intended Windows-1252 character.
+   */
+  protected static $brokenUtf8ToUtf8 = array(
+        "\xc2\x80" => "\xe2\x82\xac",
+
+        "\xc2\x82" => "\xe2\x80\x9a",
+        "\xc2\x83" => "\xc6\x92",
+        "\xc2\x84" => "\xe2\x80\x9e",
+        "\xc2\x85" => "\xe2\x80\xa6",
+        "\xc2\x86" => "\xe2\x80\xa0",
+        "\xc2\x87" => "\xe2\x80\xa1",
+        "\xc2\x88" => "\xcb\x86",
+        "\xc2\x89" => "\xe2\x80\xb0",
+        "\xc2\x8a" => "\xc5\xa0",
+        "\xc2\x8b" => "\xe2\x80\xb9",
+        "\xc2\x8c" => "\xc5\x92",
+
+        "\xc2\x8e" => "\xc5\xbd",
+
+
+        "\xc2\x91" => "\xe2\x80\x98",
+        "\xc2\x92" => "\xe2\x80\x99",
+        "\xc2\x93" => "\xe2\x80\x9c",
+        "\xc2\x94" => "\xe2\x80\x9d",
+        "\xc2\x95" => "\xe2\x80\xa2",
+        "\xc2\x96" => "\xe2\x80\x93",
+        "\xc2\x97" => "\xe2\x80\x94",
+        "\xc2\x98" => "\xcb\x9c",
+        "\xc2\x99" => "\xe2\x84\xa2",
+        "\xc2\x9a" => "\xc5\xa1",
+        "\xc2\x9b" => "\xe2\x80\xba",
+        "\xc2\x9c" => "\xc5\x93",
+
+        "\xc2\x9e" => "\xc5\xbe",
+        "\xc2\x9f" => "\xc5\xb8"
+  );
+
+  /**
+   * UTF-8 sequence of a Windows-1252 special character => its single
+   * Windows-1252 byte (inverse of $win1252ToUtf8).
+   */
+  protected static $utf8ToWin1252 = array(
+       "\xe2\x82\xac" => "\x80",
+
+       "\xe2\x80\x9a" => "\x82",
+       "\xc6\x92"     => "\x83",
+       "\xe2\x80\x9e" => "\x84",
+       "\xe2\x80\xa6" => "\x85",
+       "\xe2\x80\xa0" => "\x86",
+       "\xe2\x80\xa1" => "\x87",
+       "\xcb\x86"     => "\x88",
+       "\xe2\x80\xb0" => "\x89",
+       "\xc5\xa0"     => "\x8a",
+       "\xe2\x80\xb9" => "\x8b",
+       "\xc5\x92"     => "\x8c",
+
+       "\xc5\xbd"     => "\x8e",
+
+
+       "\xe2\x80\x98" => "\x91",
+       "\xe2\x80\x99" => "\x92",
+       "\xe2\x80\x9c" => "\x93",
+       "\xe2\x80\x9d" => "\x94",
+       "\xe2\x80\xa2" => "\x95",
+       "\xe2\x80\x93" => "\x96",
+       "\xe2\x80\x94" => "\x97",
+       "\xcb\x9c"     => "\x98",
+       "\xe2\x84\xa2" => "\x99",
+       "\xc5\xa1"     => "\x9a",
+       "\xe2\x80\xba" => "\x9b",
+       "\xc5\x93"     => "\x9c",
+
+       "\xc5\xbe"     => "\x9e",
+       "\xc5\xb8"     => "\x9f"
+  );
+
+  /**
+   * Function \ForceUTF8\Encoding::toUTF8
+   *
+   * This function leaves UTF8 characters alone, while converting almost all non-UTF8 to UTF8.
+   *
+   * It assumes that the encoding of the original string is either Windows-1252 or ISO 8859-1.
+   *
+   * It may fail to convert characters to UTF-8 if they fall into one of these scenarios:
+   *
+   * 1) when any of these characters:   ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞß
+   *    are followed by any of these:  ("group B")
+   *                                    ¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶•¸¹º»¼½¾¿
+   * For example:   %ABREPRESENT%C9%BB. «REPRESENTÉ»
+   * The "«" (%AB) character will be converted, but the "É" followed by "»" (%C9%BB)
+   * is also a valid unicode character, and will be left unchanged.
+   *
+   * 2) when any of these: àáâãäåæçèéêëìíîï  are followed by TWO chars from group B,
+   * 3) when any of these: ðñòó  are followed by THREE chars from group B.
+   *
+   * Arrays are converted element by element (keys are kept); values that are
+   * neither arrays nor strings are returned untouched.
+   *
+   * @name toUTF8
+   * @param string|array|mixed $text  Any string (or array of strings).
+   * @return string|array|mixed  The same string, UTF8 encoded
+   */
+  public static function toUTF8($text){
+    if(is_array($text)) {
+      foreach($text as $k => $v) {
+        $text[$k] = self::toUTF8($v);
+      }
+      return $text;
+    }
+
+    if(!is_string($text)) {
+      return $text;
+    }
+
+    $max = self::strlen($text);
+    $buf = "";
+    for($i = 0; $i < $max; $i++) {
+      $c1 = $text[$i];
+      $o1 = ord($c1);
+
+      if($o1 < 0x80) {
+        // Plain ASCII: it doesn't need conversion
+        $buf .= $c1;
+        continue;
+      }
+
+      if($o1 < 0xc0) {
+        // Bare 0x80-0xBF byte: cannot start an UTF-8 sequence, so it is a
+        // single-byte character. Use the Windows-1252 special cases first.
+        $buf .= isset(self::$win1252ToUtf8[$o1]) ?
+                self::$win1252ToUtf8[$o1] : self::latin1ByteToUtf8($c1);
+        continue;
+      }
+
+      // 0xC0 and above: may be the lead byte of a 2, 3 or 4 bytes sequence.
+      // Bytes 0xF8-0xFF never are, so they are always converted.
+      if($o1 <= 0xdf) {
+        $extra = 1;
+      } elseif($o1 <= 0xef) {
+        $extra = 2;
+      } elseif($o1 <= 0xf7) {
+        $extra = 3;
+      } else {
+        $extra = 0;
+      }
+
+      // Collect the expected continuation bytes (10xxxxxx); $seq is reset to
+      // null as soon as one is missing or the string ends.
+      $seq = $c1;
+      for($j = 1; $j <= $extra; $j++) {
+        $pos = $i + $j;
+        if($pos >= $max || (ord($text[$pos]) & 0xc0) !== 0x80) {
+          $seq = null;
+          break;
+        }
+        $seq .= $text[$pos];
+      }
+
+      if($extra > 0 && $seq !== null) {
+        // Yeah, almost sure it's UTF8 already: copy the whole sequence
+        $buf .= $seq;
+        $i += $extra;
+      } else {
+        // Not valid UTF8. Convert this single byte.
+        $buf .= self::latin1ByteToUtf8($c1);
+      }
+    }
+    return $buf;
+  }
+
+  /**
+   * Convert UTF-8 text to Windows-1252 bytes.
+   *
+   * @param string|array|mixed $text  String, or array converted element by element
+   * @param string $option  One of WITHOUT_ICONV, ICONV_TRANSLIT or ICONV_IGNORE
+   * @return string|array|mixed  Converted value; non-string scalars are returned untouched
+   */
+  public static function toWin1252($text, $option = self::WITHOUT_ICONV) {
+    if(is_array($text)) {
+      foreach($text as $k => $v) {
+        $text[$k] = self::toWin1252($v, $option);
+      }
+      return $text;
+    } elseif(is_string($text)) {
+      return static::utf8_decode($text, $option);
+    } else {
+      return $text;
+    }
+  }
+
+  /**
+   * Alias of toWin1252().
+   *
+   * @param string|array|mixed $text
+   * @param string $option
+   * @return string|array|mixed
+   */
+  public static function toISO8859($text, $option = self::WITHOUT_ICONV) {
+    return self::toWin1252($text, $option);
+  }
+
+  /**
+   * Alias of toWin1252().
+   *
+   * @param string|array|mixed $text
+   * @param string $option
+   * @return string|array|mixed
+   */
+  public static function toLatin1($text, $option = self::WITHOUT_ICONV) {
+    return self::toWin1252($text, $option);
+  }
+
+  /**
+   * Repair text that was UTF-8 encoded more than once (mojibake).
+   *
+   * The text is repeatedly decoded to single bytes and passed through
+   * toUTF8() until a round trip no longer changes it.
+   *
+   * @param string|array|mixed $text  String, or array fixed element by element
+   * @param string $option  One of WITHOUT_ICONV, ICONV_TRANSLIT or ICONV_IGNORE
+   * @return string|array|mixed  Fixed value; non-string scalars are returned untouched
+   */
+  public static function fixUTF8($text, $option = self::WITHOUT_ICONV){
+    if(is_array($text)) {
+      foreach($text as $k => $v) {
+        $text[$k] = self::fixUTF8($v, $option);
+      }
+      return $text;
+    }
+
+    if(!is_string($text)) {
+      return $text;
+    }
+
+    $last = "";
+    while($last != $text){
+      $last = $text;
+      $text = self::toUTF8(static::utf8_decode($text, $option));
+    }
+    // One extra round trip, kept from the original algorithm
+    $text = self::toUTF8(static::utf8_decode($text, $option));
+    return $text;
+  }
+
+  /**
+   * If you received an UTF-8 string that was converted from Windows-1252 as it was ISO8859-1
+   * (ignoring Windows-1252 chars from 80 to 9F) use this function to fix it.
+   * See: http://en.wikipedia.org/wiki/Windows-1252
+   *
+   * @param string|array $text
+   * @return string|array
+   */
+  public static function UTF8FixWin1252Chars($text){
+    return str_replace(array_keys(self::$brokenUtf8ToUtf8), array_values(self::$brokenUtf8ToUtf8), $text);
+  }
+
+  /**
+   * Strip a leading UTF-8 byte order mark (EF BB BF), if any.
+   *
+   * @param string $str
+   * @return string
+   */
+  public static function removeBOM($str=""){
+    if(substr($str, 0,3) === pack("CCC",0xef,0xbb,0xbf)) {
+      $str=substr($str, 3);
+    }
+    return $str;
+  }
+
+  /**
+   * Byte length of a string, even when mbstring.func_overload replaces
+   * strlen() by its multibyte counterpart (flag 2).
+   *
+   * @param string $text
+   * @return int
+   */
+  protected static function strlen($text){
+    return (function_exists('mb_strlen') && (((int) ini_get('mbstring.func_overload')) & 2)) ?
+           mb_strlen($text,'8bit') : strlen($text);
+  }
+
+  /**
+   * Map a loosely written encoding label to 'ISO-8859-1' or 'UTF-8'.
+   *
+   * The label is upper-cased and stripped of everything but letters, digits
+   * and whitespace before lookup; unknown labels fall back to 'UTF-8'.
+   *
+   * @param string $encodingLabel
+   * @return string  'ISO-8859-1' or 'UTF-8'
+   */
+  public static function normalizeEncoding($encodingLabel)
+  {
+    $encoding = strtoupper($encodingLabel);
+    $encoding = preg_replace('/[^a-zA-Z0-9\s]/', '', $encoding);
+    $equivalences = array(
+        'ISO88591' => 'ISO-8859-1',
+        'ISO8859'  => 'ISO-8859-1',
+        'ISO'      => 'ISO-8859-1',
+        'LATIN1'   => 'ISO-8859-1',
+        'LATIN'    => 'ISO-8859-1',
+        'UTF8'     => 'UTF-8',
+        'UTF'      => 'UTF-8',
+        'WIN1252'  => 'ISO-8859-1',
+        'WINDOWS1252' => 'ISO-8859-1'
+    );
+
+    if(empty($equivalences[$encoding])){
+      return 'UTF-8';
+    }
+
+    return $equivalences[$encoding];
+  }
+
+  /**
+   * Encode text in the encoding designated by a (loosely written) label.
+   *
+   * @param string $encodingLabel  See normalizeEncoding()
+   * @param string|array|mixed $text
+   * @return string|array|mixed
+   */
+  public static function encode($encodingLabel, $text)
+  {
+    $encodingLabel = self::normalizeEncoding($encodingLabel);
+    if($encodingLabel === 'ISO-8859-1') return self::toLatin1($text);
+    return self::toUTF8($text);
+  }
+
+  /**
+   * Convert UTF-8 text to Windows-1252 bytes.
+   *
+   * Without iconv, the Windows-1252 special characters are first replaced by
+   * their single byte, then the remaining UTF-8 sequences are decoded by
+   * decodeToSingleByte(). PHP's own utf8_decode() is not used: it replaces
+   * those bare 0x80-0x9F bytes with "?" and is deprecated as of PHP 8.2.
+   *
+   * @param string $text
+   * @param string $option  One of WITHOUT_ICONV, ICONV_TRANSLIT or ICONV_IGNORE
+   * @return string|false  Converted bytes (iconv() may return false on failure)
+   */
+  protected static function utf8_decode($text, $option = self::WITHOUT_ICONV)
+  {
+    if ($option == self::WITHOUT_ICONV || !function_exists('iconv')) {
+       $o = self::decodeToSingleByte(
+         str_replace(array_keys(self::$utf8ToWin1252), array_values(self::$utf8ToWin1252), self::toUTF8($text))
+       );
+    } else {
+       $o = iconv("UTF-8", "Windows-1252" . ($option === self::ICONV_TRANSLIT ? '//TRANSLIT' : ($option === self::ICONV_IGNORE ? '//IGNORE' : '')), $text);
+    }
+    return $o;
+  }
+
+  /**
+   * Encode one single-byte (ISO-8859-1) character as a two bytes UTF-8 sequence.
+   *
+   * Uses an integer shift rather than a division, so chr() never receives a
+   * fractional float (deprecated implicit conversion since PHP 8.1).
+   *
+   * @param string $byte  One byte, expected to be in the 0x80-0xFF range
+   * @return string  Two bytes UTF-8 sequence
+   */
+  protected static function latin1ByteToUtf8($byte)
+  {
+    $code = ord($byte);
+    return chr(0xc0 | ($code >> 6)) . chr(0x80 | ($code & 0x3f));
+  }
+
+  /**
+   * Decode UTF-8 sequences into single bytes (code points U+0000-U+00FF).
+   *
+   * - ASCII bytes and bare 0x80-0xBF bytes are copied unchanged, which
+   *   preserves the Windows-1252 bytes substituted by utf8_decode();
+   * - well-formed sequences of code points up to U+00FF become one byte;
+   * - code points above U+00FF and malformed lead bytes become "?".
+   *
+   * @param string $text  UTF-8 text, possibly mixed with bare Windows-1252 bytes
+   * @return string  Single-byte encoded text
+   */
+  protected static function decodeToSingleByte($text)
+  {
+    $max = self::strlen($text);
+    $out = "";
+    for($i = 0; $i < $max; $i++) {
+      $byte = ord($text[$i]);
+
+      if($byte < 0xc0) {
+        $out .= $text[$i];
+        continue;
+      }
+
+      if($byte < 0xc2 || $byte > 0xf7) {
+        // Overlong 2 bytes lead (0xC0/0xC1) or impossible lead (0xF8-0xFF)
+        $out .= "?";
+        continue;
+      }
+
+      if($byte < 0xe0) {
+        $extra = 1;
+        $codepoint = $byte & 0x1f;
+      } elseif($byte < 0xf0) {
+        $extra = 2;
+        $codepoint = $byte & 0x0f;
+      } else {
+        $extra = 3;
+        $codepoint = $byte & 0x07;
+      }
+
+      $valid = ($i + $extra < $max);
+      for($j = 1; $valid && $j <= $extra; $j++) {
+        $next = ord($text[$i + $j]);
+        if(($next & 0xc0) !== 0x80) {
+          $valid = false;
+        } else {
+          $codepoint = ($codepoint << 6) | ($next & 0x3f);
+        }
+      }
+
+      if(!$valid) {
+        // Truncated or malformed sequence: replace the lead byte only, the
+        // following bytes are examined on their own.
+        $out .= "?";
+        continue;
+      }
+
+      $i += $extra;
+      $out .= ($codepoint <= 0xff) ? chr($codepoint) : "?";
+    }
+    return $out;
+  }
+}