melib/text: vendor escape8259 crate into module

Signed-off-by: Manos Pitsidianakis <manos@pitsidianak.is>
2023-12-26 17:15:40 +02:00 · 2023-12-26 17:15:40 +02:00 · 66678637b5
parent 808aa4942d
commit 66678637b5
2 changed files with 340 additions and 0 deletions
--- a/melib/src/text/escape.rs
+++ b/melib/src/text/escape.rs
@ -0,0 +1,339 @@
+// SPDX-License-Identifier: MIT
+//
+// Copyright (c) 2020 Eric Seppanen <eds@reric.net>
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice (including the next
+// paragraph) shall be included in all copies or substantial portions of the
+// Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+//! `escape` performs RFC8259-compliant string escaping and un-escaping.
+//!
+//! [RFC8259] is a JSON encoding standard.  Many JSON encoders exist, but other
+//! RFCs use the same string escaping mechanism, so it's useful to be able to
+//! access the string escaping functions by themselves.
+//!
+//! Note: this code has been copied directly from the `escape8259` crate hosted on this URL: <https://github.com/ericseppanen/escape8259>
+//!
+//! # Examples
+//!
+//! ```rust
+//! use melib::text::escape::{escape, unescape};
+//!
+//! let s = "A null (\0) and a double-quote (\")";
+//! assert_eq!(escape(s), r#"A null (\u0000) and a double-quote (\")"#);
+//!
+//! let crab = r#"This is a crab: \ud83e\udd80"#;
+//! assert_eq!(unescape(crab).unwrap(), "This is a crab: 🦀");
+//!
+//! // We accept encodings that weren't really necessary.
+//! assert_eq!(unescape(r#"\u0041\n"#).unwrap(), "A\n");
+//!
+//! let multiline = r#"hello
+//!  world"#;
+//! assert_eq!(escape(multiline), r#"hello\n world"#);
+//! ```
+//!
+//! [RFC8259]: https://tools.ietf.org/html/rfc8259
+
+#![warn(missing_docs)]
+#![forbid(unsafe_code)]
+
+use std::{
+    char::decode_utf16,
+    fmt::{Display, Write},
+};
+
+/// An error occurred while unescaping.
+#[derive(Debug, Clone, Eq, PartialEq)]
+pub struct UnescapeError;
+
+impl Display for UnescapeError {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.write_str("failed rfc8259 unescape")
+    }
+}
+
+impl std::error::Error for UnescapeError {}
+
+type UnescapeResult<T> = Result<T, UnescapeError>;
+
+/// Used to collect output characters and queue u16 values for translation.
+struct UnescapeState {
+    /// The accumulated characters
+    out: String,
+    /// Store a fragment of a large character for later decoding
+    stash: u16,
+}
+
+impl UnescapeState {
+    fn new() -> Self {
+        Self {
+            out: String::new(),
+            stash: 0,
+        }
+    }
+
+    /// Collect a new character
+    fn push_char(&mut self, c: char) -> UnescapeResult<()> {
+        if self.stash != 0 {
+            return Err(UnescapeError);
+        }
+        self.out.push(c);
+        Ok(())
+    }
+
+    /// Collect a new UTF16 word.  This can either be one whole character,
+    /// or part of a larger character.
+    fn push_u16(&mut self, x: u16) -> UnescapeResult<()> {
+        let surrogate = (0xD800..=0xDFFF).contains(&x);
+        match (self.stash, surrogate) {
+            (0, false) => {
+                // The std library only provides utf16 decode of an iterator,
+                // so to decode a single character we wrap it in an array.
+                // Hopefully the compiler will elide most of this extra work.
+                let words = [x];
+                match decode_utf16(words.iter().copied()).next() {
+                    Some(Ok(c)) => {
+                        self.out.push(c);
+                    }
+                    _ => return Err(UnescapeError),
+                }
+            }
+            (0, true) => self.stash = x,
+            (_, false) => {
+                return Err(UnescapeError);
+            }
+            (w, true) => {
+                let words = [w, x];
+                match decode_utf16(words.iter().copied()).next() {
+                    Some(Ok(c)) => {
+                        self.out.push(c);
+                        self.stash = 0;
+                    }
+                    _ => return Err(UnescapeError),
+                }
+            }
+        }
+        Ok(())
+    }
+
+    /// If we queued up part of a UTF-16 encoded word but didn't
+    /// finish it, return an error.  Otherwise, consume self and
+    /// return the accumulated String.
+    fn finalize(self) -> UnescapeResult<String> {
+        if self.stash != 0 {
+            return Err(UnescapeError);
+        }
+        Ok(self.out)
+    }
+}
+
+fn parse_u16<S>(s: &mut S) -> UnescapeResult<u16>
+where
+    S: Iterator<Item = char>,
+{
+    // Placeholder character in case the input doesn't have the 4 chars we want.
+    let placeholders = std::iter::repeat('\0');
+    let hexnum: String = s.chain(placeholders).take(4).collect();
+    u16::from_str_radix(&hexnum, 16).map_err(|_| UnescapeError)
+}
+
+/// RFC8259 says non-escaped characters must be in one of the following ranges:
+/// %x20-21 / %x23-5B / %x5D-10FFFF
+fn is_safe_char(c: char) -> bool {
+    let safe_ranges = [(0x20..=0x21), (0x23..=0x5B), (0x5D..=0x10FFFF)];
+    let cv = c as u32;
+
+    safe_ranges.iter().any(|range| range.contains(&cv))
+}
+
+/// Un-escape a string, following RFC8259 rules.
+///
+/// The only allowed single-character escapes are:
+/// `\" \\ \/ /b /f /n /r /t`
+///
+/// Any other character may be escaped in UTF-16 form:
+/// `\uXXXX` or `\uXXXX\uXXXX`
+///
+/// Characters in the ranges `0x20-21`, `0x23-5B`, `0x5D-10FFFF`
+/// may appear un-escaped in the input.
+#[inline]
+pub fn unescape_rfc8259<S>(s: S) -> UnescapeResult<String>
+where
+    S: AsRef<str>,
+{
+    unescape_inner(s.as_ref())
+}
+
+fn unescape_inner(s: &str) -> UnescapeResult<String> {
+    let mut state = UnescapeState::new();
+    let mut ins = s.chars();
+
+    while let Some(c) = ins.next() {
+        if c == '\\' {
+            match ins.next() {
+                None => {
+                    return Err(UnescapeError);
+                }
+                Some(d) => {
+                    match d {
+                        '"' | '\\' | '/' => state.push_char(d)?,
+                        'b' => state.push_char('\x08')?, // backspace
+                        'f' => state.push_char('\x0C')?, // formfeed
+                        'n' => state.push_char('\n')?,   // linefeed
+                        'r' => state.push_char('\r')?,   // carriage return
+                        't' => state.push_char('\t')?,   // tab
+                        'u' => {
+                            let val = parse_u16(&mut ins)?;
+                            state.push_u16(val)?;
+                        }
+                        _ => {
+                            return Err(UnescapeError);
+                        }
+                    }
+                }
+            }
+        } else if is_safe_char(c) {
+            state.push_char(c)?;
+        } else {
+            return Err(UnescapeError);
+        }
+    }
+
+    state.finalize()
+}
+
+// %x22 /          ; "    quotation mark  U+0022
+// %x5C /          ; \    reverse solidus U+005C
+// %x2F /          ; /    solidus         U+002F
+// %x62 /          ; b    backspace       U+0008
+// %x66 /          ; f    form feed       U+000C
+// %x6E /          ; n    line feed       U+000A
+// %x72 /          ; r    carriage return U+000D
+// %x74 /          ; t    tab             U+0009
+// %x75 4HEXDIG )  ; uXXXX                U+XXXX
+
+fn force_escape(c: char, out: &mut String) {
+    let c = c as u32;
+    match c {
+        0x08 => out.push_str("\\b"),
+        0x09 => out.push_str("\\t"),
+        0x0A => out.push_str("\\n"),
+        0x0C => out.push_str("\\f"),
+        0x0D => out.push_str("\\r"),
+        0x22 => out.push_str("\\\""),
+        0x5C => out.push_str("\\\\"),
+        _ => {
+            // // RFC8259 allows unicode characters natively, so there is no need
+            // // to convert everything into \uXXXX form.  The only thing that's
+            // // required to use that form are the ASCII control characters,
+            // // which will never require more than one \uXXXX value.
+            // if c >= 0x20 {
+            //     panic!("force_escape unnecessary encoding requested");
+            // }
+            write!(out, "\\u{:04x}", c).unwrap();
+        }
+    }
+}
+
+/// Escape a string, following RFC8259 rules.
+///
+/// Only characters that require escaping will be escaped:
+/// quotation mark `?`,
+/// reverse solidus `\` (backslash),
+/// and the control characters (`0x00-1F`).
+#[inline]
+pub fn escape_rfc8259<S>(s: S) -> String
+where
+    S: AsRef<str>,
+{
+    escape_inner(s.as_ref())
+}
+
+fn escape_inner(s: &str) -> String {
+    let mut out = String::new();
+    for c in s.chars() {
+        if is_safe_char(c) {
+            out.push(c);
+        } else {
+            force_escape(c, &mut out);
+        }
+    }
+    out
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    fn assert_round_trip(s: &str) {
+        assert_eq!(s, unescape_rfc8259(escape_rfc8259(s)).unwrap());
+    }
+
+    #[test]
+    fn test_round_trip() {
+        assert_round_trip("abc");
+        assert_round_trip("\n\r\t\x08\x0C\x00");
+        assert_round_trip(r#"\"#);
+        assert_round_trip(r#"""#);
+        assert_round_trip("Σ𝄞");
+        assert_round_trip(r#"\𝄞"#);
+        assert_round_trip(r#"(╯°□°）╯︵ ┻━┻"#);
+    }
+
+    #[test]
+    fn test_escape() {
+        assert_eq!(escape_rfc8259("\0"), r#"\u0000"#);
+        assert_eq!(escape_rfc8259("\n"), r#"\n"#);
+        assert_eq!(escape_rfc8259(r#"\"#), r#"\\"#);
+        assert_eq!(escape_rfc8259(r#"""#), r#"\""#);
+        assert_eq!(escape_rfc8259("Σ"), "Σ"); // U+03A3
+        assert_eq!(escape_rfc8259("𝄞"), "𝄞"); // U+1D11E
+    }
+
+    #[test]
+    fn test_unescape() {
+        assert_eq!(unescape_rfc8259(r#"abc"#), Ok("abc".into()));
+        assert_eq!(unescape_rfc8259(r#"ab\nc"#), Ok("ab\nc".into()));
+        assert_eq!(unescape_rfc8259(r#"ab\zc"#), Err(UnescapeError));
+        assert_eq!(unescape_rfc8259(r#" \"abc\" "#), Ok(" \"abc\" ".into()));
+        assert_eq!(unescape_rfc8259(r#"𝄞"#), Ok("𝄞".into()));
+        assert_eq!(unescape_rfc8259(r#"\𝄞"#), Err(UnescapeError));
+        assert_eq!(unescape_rfc8259(r#"\uD834\uDD1E"#), Ok("𝄞".into()));
+        assert_eq!(unescape_rfc8259(r#"\uD834"#), Err(UnescapeError));
+        assert_eq!(unescape_rfc8259(r#"\uDD1E"#), Err(UnescapeError));
+        assert_eq!(unescape_rfc8259("\t"), Err(UnescapeError));
+    }
+
+    #[test]
+    fn test_generic_asref() {
+        assert_eq!(escape_rfc8259("\n"), r#"\n"#);
+        assert_eq!(escape_rfc8259(String::from("\n")), r#"\n"#);
+        assert_eq!(escape_rfc8259(String::from("\n")), r#"\n"#);
+
+        assert_eq!(unescape_rfc8259("abc"), Ok("abc".into()));
+        assert_eq!(unescape_rfc8259(String::from("abc")), Ok("abc".into()));
+        assert_eq!(unescape_rfc8259(String::from("abc")), Ok("abc".into()));
+    }
+
+    #[test]
+    fn test_error_impl() {
+        // This won't compile if UnescapeError doesn't impl Display + Error.
+        let e = UnescapeError;
+        let _x: Box<dyn std::error::Error> = e.into();
+    }
+}
--- a/melib/src/text/mod.rs
+++ b/melib/src/text/mod.rs
@ -19,6 +19,7 @@
 * along with meli. If not, see <http://www.gnu.org/licenses/>.
 */

+pub mod escape;
 pub mod grapheme_clusters;
 pub mod line_break;
 pub mod search;