melib/text: vendor escape8259 crate into module
Signed-off-by: Manos Pitsidianakis <manos@pitsidianak.is>escape8259-vendor
parent
808aa4942d
commit
66678637b5
|
@ -0,0 +1,339 @@
|
|||
// SPDX-License-Identifier: MIT
|
||||
//
|
||||
// Copyright (c) 2020 Eric Seppanen <eds@reric.net>
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to deal
|
||||
// in the Software without restriction, including without limitation the rights
|
||||
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
// copies of the Software, and to permit persons to whom the Software is
|
||||
// furnished to do so, subject to the following conditions:
|
||||
//
|
||||
// The above copyright notice and this permission notice (including the next
|
||||
// paragraph) shall be included in all copies or substantial portions of the
|
||||
// Software.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
// SOFTWARE.
|
||||
//! `escape` performs RFC8259-compliant string escaping and un-escaping.
|
||||
//!
|
||||
//! [RFC8259] is a JSON encoding standard. Many JSON encoders exist, but other
|
||||
//! RFCs use the same string escaping mechanism, so it's useful to be able to
|
||||
//! access the string escaping functions by themselves.
|
||||
//!
|
||||
//! Note: this code has been copied directly from the `escape8259` crate hosted on this URL: <https://github.com/ericseppanen/escape8259>
|
||||
//!
|
||||
//! # Examples
|
||||
//!
|
||||
//! ```rust
//! use melib::text::escape::{escape_rfc8259, unescape_rfc8259};
//!
//! let s = "A null (\0) and a double-quote (\")";
//! assert_eq!(escape_rfc8259(s), r#"A null (\u0000) and a double-quote (\")"#);
//!
//! let crab = r#"This is a crab: \ud83e\udd80"#;
//! assert_eq!(unescape_rfc8259(crab).unwrap(), "This is a crab: 🦀");
//!
//! // We accept encodings that weren't really necessary.
//! assert_eq!(unescape_rfc8259(r#"\u0041\n"#).unwrap(), "A\n");
//!
//! let multiline = r#"hello
//!  world"#;
//! assert_eq!(escape_rfc8259(multiline), r#"hello\n world"#);
//! ```
|
||||
//!
|
||||
//! [RFC8259]: https://tools.ietf.org/html/rfc8259
|
||||
|
||||
#![warn(missing_docs)]
|
||||
#![forbid(unsafe_code)]
|
||||
|
||||
use std::{
|
||||
char::decode_utf16,
|
||||
fmt::{Display, Write},
|
||||
};
|
||||
|
||||
/// The error returned when a string cannot be un-escaped.
///
/// It is a unit type: it does not say which part of the input was rejected.
#[derive(Debug, Clone, Eq, PartialEq)]
pub struct UnescapeError;

impl std::error::Error for UnescapeError {}

impl Display for UnescapeError {
    fn fmt(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(formatter, "failed rfc8259 unescape")
    }
}

/// Shorthand for a `Result` whose error type is [`UnescapeError`].
type UnescapeResult<T> = Result<T, UnescapeError>;
|
||||
|
||||
/// Used to collect output characters and queue u16 values for translation.
|
||||
struct UnescapeState {
|
||||
/// The accumulated characters
|
||||
out: String,
|
||||
/// Store a fragment of a large character for later decoding
|
||||
stash: u16,
|
||||
}
|
||||
|
||||
impl UnescapeState {
|
||||
fn new() -> Self {
|
||||
Self {
|
||||
out: String::new(),
|
||||
stash: 0,
|
||||
}
|
||||
}
|
||||
|
||||
/// Collect a new character
|
||||
fn push_char(&mut self, c: char) -> UnescapeResult<()> {
|
||||
if self.stash != 0 {
|
||||
return Err(UnescapeError);
|
||||
}
|
||||
self.out.push(c);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Collect a new UTF16 word. This can either be one whole character,
|
||||
/// or part of a larger character.
|
||||
fn push_u16(&mut self, x: u16) -> UnescapeResult<()> {
|
||||
let surrogate = (0xD800..=0xDFFF).contains(&x);
|
||||
match (self.stash, surrogate) {
|
||||
(0, false) => {
|
||||
// The std library only provides utf16 decode of an iterator,
|
||||
// so to decode a single character we wrap it in an array.
|
||||
// Hopefully the compiler will elide most of this extra work.
|
||||
let words = [x];
|
||||
match decode_utf16(words.iter().copied()).next() {
|
||||
Some(Ok(c)) => {
|
||||
self.out.push(c);
|
||||
}
|
||||
_ => return Err(UnescapeError),
|
||||
}
|
||||
}
|
||||
(0, true) => self.stash = x,
|
||||
(_, false) => {
|
||||
return Err(UnescapeError);
|
||||
}
|
||||
(w, true) => {
|
||||
let words = [w, x];
|
||||
match decode_utf16(words.iter().copied()).next() {
|
||||
Some(Ok(c)) => {
|
||||
self.out.push(c);
|
||||
self.stash = 0;
|
||||
}
|
||||
_ => return Err(UnescapeError),
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// If we queued up part of a UTF-16 encoded word but didn't
|
||||
/// finish it, return an error. Otherwise, consume self and
|
||||
/// return the accumulated String.
|
||||
fn finalize(self) -> UnescapeResult<String> {
|
||||
if self.stash != 0 {
|
||||
return Err(UnescapeError);
|
||||
}
|
||||
Ok(self.out)
|
||||
}
|
||||
}
|
||||
|
||||
fn parse_u16<S>(s: &mut S) -> UnescapeResult<u16>
|
||||
where
|
||||
S: Iterator<Item = char>,
|
||||
{
|
||||
// Placeholder character in case the input doesn't have the 4 chars we want.
|
||||
let placeholders = std::iter::repeat('\0');
|
||||
let hexnum: String = s.chain(placeholders).take(4).collect();
|
||||
u16::from_str_radix(&hexnum, 16).map_err(|_| UnescapeError)
|
||||
}
|
||||
|
||||
/// Report whether `c` may appear in an RFC8259 string without being escaped.
///
/// The allowed code point ranges are `%x20-21 / %x23-5B / %x5D-10FFFF`, i.e.
/// everything except the control characters below `0x20`, the quotation mark
/// (`0x22`) and the reverse solidus (`0x5C`).
fn is_safe_char(c: char) -> bool {
    matches!(u32::from(c), 0x20..=0x21 | 0x23..=0x5B | 0x5D..=0x10FFFF)
}
|
||||
|
||||
/// Un-escape a string, following RFC8259 rules.
|
||||
///
|
||||
/// The only allowed single-character escapes are:
|
||||
/// `\" \\ \/ /b /f /n /r /t`
|
||||
///
|
||||
/// Any other character may be escaped in UTF-16 form:
|
||||
/// `\uXXXX` or `\uXXXX\uXXXX`
|
||||
///
|
||||
/// Characters in the ranges `0x20-21`, `0x23-5B`, `0x5D-10FFFF`
|
||||
/// may appear un-escaped in the input.
|
||||
#[inline]
|
||||
pub fn unescape_rfc8259<S>(s: S) -> UnescapeResult<String>
|
||||
where
|
||||
S: AsRef<str>,
|
||||
{
|
||||
unescape_inner(s.as_ref())
|
||||
}
|
||||
|
||||
fn unescape_inner(s: &str) -> UnescapeResult<String> {
|
||||
let mut state = UnescapeState::new();
|
||||
let mut ins = s.chars();
|
||||
|
||||
while let Some(c) = ins.next() {
|
||||
if c == '\\' {
|
||||
match ins.next() {
|
||||
None => {
|
||||
return Err(UnescapeError);
|
||||
}
|
||||
Some(d) => {
|
||||
match d {
|
||||
'"' | '\\' | '/' => state.push_char(d)?,
|
||||
'b' => state.push_char('\x08')?, // backspace
|
||||
'f' => state.push_char('\x0C')?, // formfeed
|
||||
'n' => state.push_char('\n')?, // linefeed
|
||||
'r' => state.push_char('\r')?, // carriage return
|
||||
't' => state.push_char('\t')?, // tab
|
||||
'u' => {
|
||||
let val = parse_u16(&mut ins)?;
|
||||
state.push_u16(val)?;
|
||||
}
|
||||
_ => {
|
||||
return Err(UnescapeError);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} else if is_safe_char(c) {
|
||||
state.push_char(c)?;
|
||||
} else {
|
||||
return Err(UnescapeError);
|
||||
}
|
||||
}
|
||||
|
||||
state.finalize()
|
||||
}
|
||||
|
||||
// %x22 / ; " quotation mark U+0022
|
||||
// %x5C / ; \ reverse solidus U+005C
|
||||
// %x2F / ; / solidus U+002F
|
||||
// %x62 / ; b backspace U+0008
|
||||
// %x66 / ; f form feed U+000C
|
||||
// %x6E / ; n line feed U+000A
|
||||
// %x72 / ; r carriage return U+000D
|
||||
// %x74 / ; t tab U+0009
|
||||
// %x75 4HEXDIG ) ; uXXXX U+XXXX
|
||||
|
||||
/// Append the escaped form of `c` to `out`, whether or not `c` needs it.
///
/// Characters with a short RFC8259 escape (`\b \t \n \f \r \" \\`) use it.
/// Every other character is written as `\uXXXX` with lower-case hexadecimal
/// digits, one escape per UTF-16 code unit, so a character above `U+FFFF`
/// becomes a surrogate pair (`\uXXXX\uXXXX`) rather than an invalid escape
/// with more than four digits.
fn force_escape(c: char, out: &mut String) {
    match c {
        '\x08' => out.push_str("\\b"),
        '\t' => out.push_str("\\t"),
        '\n' => out.push_str("\\n"),
        '\x0C' => out.push_str("\\f"),
        '\r' => out.push_str("\\r"),
        '"' => out.push_str("\\\""),
        '\\' => out.push_str("\\\\"),
        _ => {
            // RFC8259 only has 4-digit escapes, so encode to UTF-16 first.
            let mut units = [0u16; 2];
            for unit in c.encode_utf16(&mut units).iter() {
                write!(out, "\\u{:04x}", unit).expect("writing to a String cannot fail");
            }
        }
    }
}
|
||||
|
||||
/// Escape a string, following RFC8259 rules.
|
||||
///
|
||||
/// Only characters that require escaping will be escaped:
|
||||
/// quotation mark `?`,
|
||||
/// reverse solidus `\` (backslash),
|
||||
/// and the control characters (`0x00-1F`).
|
||||
#[inline]
|
||||
pub fn escape_rfc8259<S>(s: S) -> String
|
||||
where
|
||||
S: AsRef<str>,
|
||||
{
|
||||
escape_inner(s.as_ref())
|
||||
}
|
||||
|
||||
fn escape_inner(s: &str) -> String {
|
||||
let mut out = String::new();
|
||||
for c in s.chars() {
|
||||
if is_safe_char(c) {
|
||||
out.push(c);
|
||||
} else {
|
||||
force_escape(c, &mut out);
|
||||
}
|
||||
}
|
||||
out
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Escaping and then un-escaping must give back the original string.
    fn assert_round_trip(s: &str) {
        assert_eq!(s, unescape_rfc8259(escape_rfc8259(s)).unwrap());
    }

    #[test]
    fn test_round_trip() {
        assert_round_trip("abc");
        assert_round_trip("\n\r\t\x08\x0C\x00");
        assert_round_trip(r#"\"#);
        assert_round_trip(r#"""#);
        assert_round_trip("Σ𝄞");
        assert_round_trip(r#"\𝄞"#);
        assert_round_trip(r#"(╯°□°）╯︵ ┻━┻"#);
    }

    #[test]
    fn test_escape() {
        assert_eq!(escape_rfc8259("\0"), r#"\u0000"#);
        assert_eq!(escape_rfc8259("\n"), r#"\n"#);
        assert_eq!(escape_rfc8259(r#"\"#), r#"\\"#);
        assert_eq!(escape_rfc8259(r#"""#), r#"\""#);
        assert_eq!(escape_rfc8259("Σ"), "Σ"); // U+03A3
        assert_eq!(escape_rfc8259("𝄞"), "𝄞"); // U+1D11E
    }

    #[test]
    fn test_unescape() {
        assert_eq!(unescape_rfc8259(r#"abc"#), Ok("abc".into()));
        assert_eq!(unescape_rfc8259(r#"ab\nc"#), Ok("ab\nc".into()));
        assert_eq!(unescape_rfc8259(r#"ab\zc"#), Err(UnescapeError));
        assert_eq!(unescape_rfc8259(r#" \"abc\" "#), Ok(" \"abc\" ".into()));
        assert_eq!(unescape_rfc8259(r#"𝄞"#), Ok("𝄞".into()));
        assert_eq!(unescape_rfc8259(r#"\𝄞"#), Err(UnescapeError));
        // U+1D11E written as a UTF-16 surrogate pair.
        assert_eq!(unescape_rfc8259(r#"\uD834\uDD1E"#), Ok("𝄞".into()));
        // Either half of the pair on its own is invalid.
        assert_eq!(unescape_rfc8259(r#"\uD834"#), Err(UnescapeError));
        assert_eq!(unescape_rfc8259(r#"\uDD1E"#), Err(UnescapeError));
        // A literal tab is a control character and must be escaped.
        assert_eq!(unescape_rfc8259("\t"), Err(UnescapeError));
    }

    #[test]
    fn test_generic_asref() {
        // Exercise the `AsRef<str>` bound with borrowed, owned and boxed strings.
        assert_eq!(escape_rfc8259("\n"), r#"\n"#);
        assert_eq!(escape_rfc8259(String::from("\n")), r#"\n"#);
        assert_eq!(escape_rfc8259(Box::<str>::from("\n")), r#"\n"#);

        assert_eq!(unescape_rfc8259("abc"), Ok("abc".into()));
        assert_eq!(unescape_rfc8259(String::from("abc")), Ok("abc".into()));
        assert_eq!(unescape_rfc8259(Box::<str>::from("abc")), Ok("abc".into()));
    }

    #[test]
    fn test_error_impl() {
        // This won't compile if UnescapeError doesn't impl Display + Error.
        let e = UnescapeError;
        let _x: Box<dyn std::error::Error> = e.into();
    }
}
|
|
@ -19,6 +19,7 @@
|
|||
* along with meli. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
pub mod escape;
|
||||
pub mod grapheme_clusters;
|
||||
pub mod line_break;
|
||||
pub mod search;
|
||||
|
|
Loading…
Reference in New Issue