소스 검색

Attempt to recover from json encoding errors

Detect and attempt to recover from json_encode errors triggered by
strings containing invalid UTF-8 sequences. Recovery will only be
attempted when encoding strings or arrays. If recovery fails then
a RuntimeException will be thrown.

The recovery process will convert invalid UTF-8 byte sequences as though the
input string was encoded using the ISO-8859-15 character encoding. This
conversion may produce incorrect characters if the original encoding was
not ISO-8859-15, but the result will always be a valid UTF-8 string.

Closes #545
Bryan Davis 10 년 전
부모
커밋
6f9e221bd6

+ 90 - 7
src/Monolog/Formatter/NormalizerFormatter.php

@@ -138,25 +138,76 @@ class NormalizerFormatter implements FormatterInterface
         return $data;
     }
 
+    /**
+     * Return the JSON representation of a value
+     *
+     * @param mixed $data
+     * @param bool  $ignoreErrors
+     * @return string
+     * @throws \RuntimeException if encoding fails and errors are not ignored
+     */
     protected function toJson($data, $ignoreErrors = false)
     {
         // suppress json_encode errors since it's twitchy with some inputs
         if ($ignoreErrors) {
-            if (version_compare(PHP_VERSION, '5.4.0', '>=')) {
-                return @json_encode($data, JSON_UNESCAPED_SLASHES | JSON_UNESCAPED_UNICODE);
-            }
+            return @$this->jsonEncode($data);
+        }
 
-            return @json_encode($data);
+        $json = $this->jsonEncode($data);
+
+        if ($json === false) {
+            $json = $this->handleJsonError(json_last_error(), $data);
         }
 
+        return $json;
+    }
+
+    /**
+     * @param mixed $data
+     * @return string|false JSON encoded data, or false on failure
+     */
+    private function jsonEncode($data)
+    {
         if (version_compare(PHP_VERSION, '5.4.0', '>=')) {
-            $json = json_encode($data, JSON_UNESCAPED_SLASHES | JSON_UNESCAPED_UNICODE);
+            return json_encode($data, JSON_UNESCAPED_SLASHES | JSON_UNESCAPED_UNICODE);
+        }
+
+        return json_encode($data);
+    }
+
+    /**
+     * Handle a json_encode failure.
+     *
+     * If the failure is due to invalid string encoding, try to clean the
+     * input and encode again. If the second encoding attempt fails, the
+     * initial error is not encoding related, or the input can't be cleaned,
+     * then raise a descriptive exception.
+     *
+     * @param int   $code return code of json_last_error function
+     * @param mixed $data data that was meant to be encoded
+     * @return string JSON encoded data after error correction
+     * @throws \RuntimeException if failure can't be corrected
+     */
+    private function handleJsonError($code, $data)
+    {
+        if ($code !== JSON_ERROR_UTF8) {
+            $this->throwEncodeError($code, $data);
+        }
+
+        if (is_string($data)) {
+            $this->detectAndCleanUtf8($data);
+
+        } elseif (is_array($data)) {
+            array_walk_recursive($data, array($this, 'detectAndCleanUtf8'));
+
         } else {
-            $json = json_encode($data);
+            $this->throwEncodeError($code, $data);
         }
 
+        $json = $this->jsonEncode($data);
+
         if ($json === false) {
-            $this->throwEncodeError(json_last_error(), $data);
+            $json = $this->throwEncodeError(json_last_error(), $data);
         }
 
         return $json;
@@ -190,4 +241,36 @@ class NormalizerFormatter implements FormatterInterface
 
         throw new \RuntimeException('JSON encoding failed: '.$msg.'. Encoding: '.var_export($data, true));
     }
+
+    /**
+     * Detect invalid UTF-8 string characters and convert to valid UTF-8.
+     *
+     * Valid UTF-8 input will be left unmodified, but strings containing
+     * invalid UTF-8 codepoints will be reencoded as UTF-8 with an assumed
+     * original encoding of ISO-8859-15. This conversion may result in
+     * incorrect output if the actual encoding was not ISO-8859-15, but it
+     * will be clean UTF-8 output and will not rely on expensive and fragile
+     * detection algorithms.
+     *
+     * Function converts the input in place in the passed variable so that it
+     * can be used as a callback for array_walk_recursive.
+     *
+     * @param mixed &$data Input to check and convert if needed
+     * @access private
+     */
+    public function detectAndCleanUtf8(&$data)
+    {
+        if (is_string($data) && !preg_match('//u', $data)) {
+            $data = preg_replace_callback(
+                '/[\x80-\xFF]+/',
+                function ($m) { return utf8_encode($m[0]); },
+                $data
+            );
+            $data = str_replace(
+                array('¤', '¦', '¨', '´', '¸', '¼', '½', '¾'),
+                array('€', 'Š', 'š', 'Ž', 'ž', 'Œ', 'œ', 'Ÿ'),
+                $data
+            );
+        }
+    }
 }

+ 43 - 0
tests/Monolog/Formatter/LogstashFormatterTest.php

@@ -15,6 +15,12 @@ use Monolog\Logger;
 
 class LogstashFormatterTest extends \PHPUnit_Framework_TestCase
 {
+    public function tearDown()
+    {
+        \PHPUnit_Framework_Error_Warning::$enabled = true;
+        return parent::tearDown();
+    }
+
     /**
      * @covers Monolog\Formatter\LogstashFormatter::format
      */
@@ -286,4 +292,41 @@ class LogstashFormatterTest extends \PHPUnit_Framework_TestCase
         $this->assertArrayHasKey('type', $message);
         $this->assertEquals('app', $message['type']);
     }
+
+    public function testFormatWithLatin9Data()
+    {
+        if (version_compare(PHP_VERSION, '5.5.0', '<')) {
+            // Ignore the warning that will be emitted by PHP <5.5.0
+            \PHPUnit_Framework_Error_Warning::$enabled = false;
+        }
+        $formatter = new LogstashFormatter('test', 'hostname');
+        $record = array(
+            'level' => Logger::ERROR,
+            'level_name' => 'ERROR',
+            'channel' => '¯\_(ツ)_/¯',
+            'context' => array(),
+            'datetime' => new \DateTime("@0"),
+            'extra' => array(
+                'user_agent' => "\xD6WN; FBCR/OrangeEspa\xF1a; Vers\xE3o/4.0; F\xE4rist",
+            ),
+            'message' => 'log',
+        );
+
+        $message = json_decode($formatter->format($record), true);
+
+        $this->assertEquals("1970-01-01T00:00:00.000000+00:00", $message['@timestamp']);
+        $this->assertEquals('log', $message['@message']);
+        $this->assertEquals('¯\_(ツ)_/¯', $message['@fields']['channel']);
+        $this->assertContains('¯\_(ツ)_/¯', $message['@tags']);
+        $this->assertEquals(Logger::ERROR, $message['@fields']['level']);
+        $this->assertEquals('test', $message['@type']);
+        $this->assertEquals('hostname', $message['@source']);
+        if (version_compare(PHP_VERSION, '5.5.0', '>=')) {
+            $this->assertEquals('ÖWN; FBCR/OrangeEspaña; Versão/4.0; Färist', $message['@fields']['user_agent']);
+        } else {
+            // PHP <5.5 does not return false for an element encoding failure,
+            // instead it emits a warning (possibly) and nulls the value.
+            $this->assertEquals(null, $message['@fields']['user_agent']);
+        }
+    }
 }

+ 92 - 3
tests/Monolog/Formatter/NormalizerFormatterTest.php

@@ -16,6 +16,12 @@ namespace Monolog\Formatter;
  */
 class NormalizerFormatterTest extends \PHPUnit_Framework_TestCase
 {
+    public function tearDown()
+    {
+        \PHPUnit_Framework_Error_Warning::$enabled = true;
+        return parent::tearDown();
+    }
+
     public function testFormat()
     {
         $formatter = new NormalizerFormatter('Y-m-d');
@@ -188,17 +194,100 @@ class NormalizerFormatterTest extends \PHPUnit_Framework_TestCase
      */
     public function testThrowsOnInvalidEncoding()
     {
+        if (version_compare(PHP_VERSION, '5.5.0', '<')) {
+            // Ignore the warning that will be emitted by PHP <5.5.0
+            \PHPUnit_Framework_Error_Warning::$enabled = false;
+        }
         $formatter = new NormalizerFormatter();
         $reflMethod = new \ReflectionMethod($formatter, 'toJson');
         $reflMethod->setAccessible(true);
 
-        // send an invalid unicode sequence
-        $res = $reflMethod->invoke($formatter, array('message' => "\xB1\x31"));
+        // send an invalid unicode sequence as an object that can't be cleaned
+        $record = new \stdClass;
+        $record->message = "\xB1\x31";
+        $res = $reflMethod->invoke($formatter, $record);
         if (PHP_VERSION_ID < 50500 && $res === '{"message":null}') {
             throw new \RuntimeException('PHP 5.3/5.4 throw a warning and null the value instead of returning false entirely');
         }
     }
 
+    public function testConvertsInvalidEncodingAsLatin9()
+    {
+        if (version_compare(PHP_VERSION, '5.5.0', '<')) {
+            // Ignore the warning that will be emitted by PHP <5.5.0
+            \PHPUnit_Framework_Error_Warning::$enabled = false;
+        }
+        $formatter = new NormalizerFormatter();
+        $reflMethod = new \ReflectionMethod($formatter, 'toJson');
+        $reflMethod->setAccessible(true);
+
+        $res = $reflMethod->invoke($formatter, array('message' => "\xA4\xA6\xA8\xB4\xB8\xBC\xBD\xBE"));
+
+        if (version_compare(PHP_VERSION, '5.5.0', '>=')) {
+            $this->assertSame('{"message":"€ŠšŽžŒœŸ"}', $res);
+        } else {
+            // PHP <5.5 does not return false for an element encoding failure,
+            // instead it emits a warning (possibly) and nulls the value.
+            $this->assertSame('{"message":null}', $res);
+        }
+    }
+
+    /**
+     * @param mixed $in Input
+     * @param mixed $expect Expected output
+     * @covers Monolog\Formatter\NormalizerFormatter::detectAndCleanUtf8
+     * @dataProvider providesDetectAndCleanUtf8
+     */
+    public function testDetectAndCleanUtf8($in, $expect)
+    {
+        $formatter = new NormalizerFormatter();
+        $formatter->detectAndCleanUtf8($in);
+        $this->assertSame($expect, $in);
+    }
+
+    public function providesDetectAndCleanUtf8()
+    {
+        $obj = new \stdClass;
+        return array(
+            'null' => array(null, null),
+            'int' => array(123, 123),
+            'float' => array(123.45, 123.45),
+            'bool false' => array(false, false),
+            'bool true' => array(true, true),
+            'ascii string' => array('abcdef', 'abcdef'),
+            'latin9 string' => array("\xB1\x31\xA4\xA6\xA8\xB4\xB8\xBC\xBD\xBE\xFF", '±1€ŠšŽžŒœŸÿ'),
+            'unicode string' => array('¤¦¨´¸¼½¾€ŠšŽžŒœŸ', '¤¦¨´¸¼½¾€ŠšŽžŒœŸ'),
+            'empty array' => array(array(), array()),
+            'array' => array(array('abcdef'), array('abcdef')),
+            'object' => array($obj, $obj),
+        );
+    }
+
+    /**
+     * @param int $code
+     * @param string $msg
+     * @dataProvider providesHandleJsonErrorFailure
+     */
+    public function testHandleJsonErrorFailure($code, $msg)
+    {
+        $formatter = new NormalizerFormatter();
+        $reflMethod = new \ReflectionMethod($formatter, 'handleJsonError');
+        $reflMethod->setAccessible(true);
+
+        $this->setExpectedException('RuntimeException', $msg);
+        $reflMethod->invoke($formatter, $code, 'faked');
+    }
+
+    public function providesHandleJsonErrorFailure()
+    {
+        return array(
+            'depth' => array(JSON_ERROR_DEPTH, 'Maximum stack depth exceeded'),
+            'state' => array(JSON_ERROR_STATE_MISMATCH, 'Underflow or the modes mismatch'),
+            'ctrl' => array(JSON_ERROR_CTRL_CHAR, 'Unexpected control character found'),
+            'default' => array(-1, 'Unknown error'),
+        );
+    }
+
     public function testExceptionTraceWithArgs()
     {
         if (defined('HHVM_VERSION')) {
@@ -284,4 +373,4 @@ class TestToStringError
     {
         throw new \RuntimeException('Could not convert to string');
     }
-}
+}