From 096c2970b3242cf4c6ec33a28f95d3d290ff511a Mon Sep 17 00:00:00 2001
From: Manos Pitsidianakis <el13635@mail.ntua.gr>
Date: Wed, 16 Sep 2020 13:12:17 +0300
Subject: [PATCH] melib/email/parser: impl RFC6532

Implement RFC 6532 ("Internationalized Email Headers"): quoted-pair, ctext and atext now accept UTF8-non-ascii.
---
 melib/src/email/parser.rs | 152 +++++++++++++++++++++++++++++---------
 1 file changed, 119 insertions(+), 33 deletions(-)

diff --git a/melib/src/email/parser.rs b/melib/src/email/parser.rs
index 4eee76106..be9407837 100644
--- a/melib/src/email/parser.rs
+++ b/melib/src/email/parser.rs
@@ -506,22 +506,111 @@ pub mod dates {
 
 pub mod generic {
     use super::*;
+    /// Builds a parser that accepts exactly one byte in the inclusive range `a..=b`.
+    ///
+    /// On success the matched byte is returned and the rest of the input follows
+    /// it; empty input or a first byte outside the range yields a recoverable
+    /// `nom::Err::Error` built from the untouched input.
+    fn byte_in_range<'a>(a: u8, b: u8) -> impl Fn(&'a [u8]) -> IResult<&'a [u8], u8> {
+        move |input| match input.split_first() {
+            None => Err(nom::Err::Error((input, "empty input").into())),
+            Some((&byte, rest)) if (a..=b).contains(&byte) => Ok((rest, byte)),
+            Some(_) => Err(nom::Err::Error((input, "out of range").into())),
+        }
+    }
+
+    /// Parses a single non-ASCII UTF-8 character (two to four octets) and returns
+    /// its octets, following the ABNF ([RFC5234]) taken from [RFC3629]:
+    /// `UTF8-non-ascii  =   UTF8-2 / UTF8-3 / UTF8-4`
+    fn utf8_non_ascii(input: &[u8]) -> IResult<&[u8], Cow<'_, [u8]>> {
+        /// `UTF8-2 = %xC2-DF UTF8-tail`; yields the two matched bytes.
+        fn utf8_2(input: &[u8]) -> IResult<&[u8], &[u8]> {
+            let (rest, _) = byte_in_range(0xc2, 0xdf)(input)?;
+            let (rest, _) = utf8_tail(rest)?;
+            Ok((rest, &input[0..2]))
+        }
+        /// `UTF8-3 = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) / %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail )`
+        fn utf8_3<'a>(input: &'a [u8]) -> IResult<&'a [u8], &'a [u8]> {
+            alt((
+                |input: &'a [u8]| -> IResult<&'a [u8], &'a [u8]> {
+                    let (rest, _) = byte_in_range(0xe0, 0xe0)(input)?;
+                    let (rest, _) = byte_in_range(0xa0, 0xbf)(rest)?;
+                    let (rest, _) = utf8_tail(rest)?;
+                    Ok((rest, &input[0..3]))
+                },
+                |input: &'a [u8]| -> IResult<&'a [u8], &'a [u8]> {
+                    let (rest, _) = byte_in_range(0xe1, 0xec)(input)?;
+                    let (rest, _) = utf8_tail(rest)?;
+                    let (rest, _) = utf8_tail(rest)?;
+                    Ok((rest, &input[0..3]))
+                },
+                |input: &'a [u8]| -> IResult<&'a [u8], &'a [u8]> {
+                    let (rest, _) = byte_in_range(0xed, 0xed)(input)?;
+                    let (rest, _) = byte_in_range(0x80, 0x9f)(rest)?;
+                    let (rest, _) = utf8_tail(rest)?;
+                    Ok((rest, &input[0..3]))
+                },
+                |input: &'a [u8]| -> IResult<&'a [u8], &'a [u8]> {
+                    let (rest, _) = byte_in_range(0xee, 0xef)(input)?;
+                    let (rest, _) = utf8_tail(rest)?;
+                    let (rest, _) = utf8_tail(rest)?;
+                    Ok((rest, &input[0..3]))
+                },
+            ))(input)
+        }
+        /// `UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) / %xF4 %x80-8F 2( UTF8-tail )`
+        fn utf8_4<'a>(input: &'a [u8]) -> IResult<&'a [u8], &'a [u8]> {
+            alt((
+                |input: &'a [u8]| -> IResult<&'a [u8], &'a [u8]> {
+                    let (rest, _) = byte_in_range(0xf0, 0xf0)(input)?;
+                    let (rest, _) = byte_in_range(0x90, 0xbf)(rest)?;
+                    let (rest, _) = utf8_tail(rest)?;
+                    let (rest, _) = utf8_tail(rest)?;
+                    Ok((rest, &input[0..4]))
+                },
+                |input: &'a [u8]| {
+                    let (rest, _) = byte_in_range(0xf1, 0xf3)(input)?;
+                    let (rest, _) = utf8_tail(rest)?;
+                    let (rest, _) = utf8_tail(rest)?;
+                    let (rest, _) = utf8_tail(rest)?;
+                    Ok((rest, &input[0..4]))
+                },
+                |input: &'a [u8]| {
+                    let (rest, _) = byte_in_range(0xf4, 0xf4)(input)?;
+                    let (rest, _) = byte_in_range(0x80, 0x8f)(rest)?;
+                    let (rest, _) = utf8_tail(rest)?;
+                    let (rest, _) = utf8_tail(rest)?;
+                    Ok((rest, &input[0..4]))
+                },
+            ))(input)
+        }
+        /// `UTF8-tail = %x80-BF`; yields the single matched byte as a one-byte slice.
+        fn utf8_tail(input: &[u8]) -> IResult<&[u8], &[u8]> {
+            let (rest, _) = byte_in_range(0x80, 0xbf)(input)?;
+            Ok((rest, &input[0..1]))
+        }
+
+        let (rest, ret) = alt((utf8_2, utf8_3, utf8_4))(input)?;
+
+        Ok((rest, ret.into()))
+    }
+
     ///`%x21-7E`
+    /// RFC6532 adds `VCHAR =/ UTF8-non-ascii`; only the ASCII range is matched here.
     fn vchar(input: &[u8]) -> IResult<&[u8], u8> {
-        if input.is_empty() {
-            return Err(nom::Err::Error((input, "vchar(): empty input").into()));
-        }
-        if input[0] >= 0x21 && input[0] <= 0x7e {
-            Ok((&input[1..], input[0]))
-        } else {
-            Err(nom::Err::Error((input, "vchar(): out of range").into()))
-        }
+        byte_in_range(0x21, 0x7e)(input)
     }
 
     ///`quoted-pair     =   ("\" (VCHAR / WSP)) / obs-qp`
     fn quoted_pair(input: &[u8]) -> IResult<&[u8], Cow<'_, [u8]>> {
-        let (input, byte) = preceded(tag("\\"), alt((vchar, wsp)))(input)?;
-        Ok((input, vec![byte].into()))
+        // The escaped character may be a multi-byte UTF8-non-ascii sequence, so
+        // the single-byte alternatives are wrapped in a one-element `Vec`.
+        let escaped = alt((
+            utf8_non_ascii,
+            map(vchar, |byte| vec![byte].into()),
+            map(wsp, |byte| vec![byte].into()),
+        ));
+        preceded(tag("\\"), escaped)(input)
     }
 
     ///```text
@@ -531,17 +620,17 @@ pub mod generic {
     ///                     obs-ctext
     ///```
     fn ctext(input: &[u8]) -> IResult<&[u8], ()> {
-        if input.is_empty() {
-            return Err(nom::Err::Error((input, "ctext(): empty input").into()));
-        }
-        if (input[0] >= 33 && input[0] <= 39)
-            || (input[0] >= 42 && input[0] <= 91)
-            || (input[0] >= 93 && input[0] <= 126)
-        {
-            Ok((&input[1..], ()))
-        } else {
-            Err(nom::Err::Error((input, "ctext(): out of range").into()))
-        }
+        // Single-byte alternatives: the byte ranges 33-39, 42-91 and 93-126,
+        // i.e. everything from 33 to 126 except 40, 41 and 92.
+        let ascii = alt((
+            byte_in_range(33, 39),
+            byte_in_range(42, 91),
+            byte_in_range(93, 126),
+        ));
+        // Either form is accepted; the matched bytes themselves are discarded and
+        // only the advanced input is reported back.
+        let multibyte = map(utf8_non_ascii, |_| ());
+        alt((map(ascii, |_| ()), multibyte))(input)
     }
 
     ///```text
@@ -967,7 +1056,7 @@ pub mod generic {
         let mut at_least_one = false;
         while let Ok((_input, atext_r)) = atext(input) {
             at_least_one = true;
-            ret.push(atext_r);
+            ret.extend_from_slice(&atext_r);
             input = _input;
         }
         if !at_least_one {
@@ -985,7 +1074,7 @@ pub mod generic {
             let mut at_least_one = false;
             while let Ok((_input, atext_r)) = atext(input) {
                 at_least_one = true;
-                ret.push(atext_r);
+                ret.extend_from_slice(&atext_r);
                 input = _input;
             }
             if !at_least_one {
@@ -998,7 +1087,7 @@ pub mod generic {
     }
 
     ///`atext           =   ALPHA / DIGIT /    ; Printable US-ASCII "!" / "#" /        ;  characters not including "$" / "%" /        ;  specials.  Used for atoms.  "&" / "'" / "*" / "+" / "-" / "/" / "=" / "?" / "^" / "_" / "`" / "{" / "|" / "}" / "~"`
-    pub fn atext(input: &[u8]) -> IResult<&[u8], u8> {
+    pub fn atext_ascii(input: &[u8]) -> IResult<&[u8], Cow<'_, [u8]>> {
         if input.is_empty() {
             return Err(nom::Err::Error((input, "atext(): empty input").into()));
         }
@@ -1009,12 +1098,18 @@ pub mod generic {
             ]
             .contains(&input[0])
         {
-            Ok((&input[1..], input[0]))
+            Ok((&input[1..], input[0..1].into()))
         } else {
             return Err(nom::Err::Error((input, "atext(): invalid byte").into()));
         }
     }
 
+    /// RFC6532 `atext`: one ASCII `atext` byte (see [`atext_ascii`]) or one
+    /// `UTF8-non-ascii` sequence, tried in that order.
+    pub fn atext(input: &[u8]) -> IResult<&[u8], Cow<'_, [u8]>> {
+        alt((atext_ascii, utf8_non_ascii))(input)
+    }
+
     ///dot-atom        =   [CFWS] dot-atom-text [CFWS]
     pub fn dot_atom(input: &[u8]) -> IResult<&[u8], Cow<'_, [u8]>> {
         let (input, _) = opt(cfws)(input)?;
@@ -1029,14 +1122,7 @@ pub mod generic {
     ///                    obs-dtext          ;  "[", "]", or "\"
     ///```
     pub fn dtext(input: &[u8]) -> IResult<&[u8], u8> {
-        if input.is_empty() {
-            return Err(nom::Err::Error((input, "dtext(): empty input").into()));
-        }
-        if (input[0] >= 33 && input[0] <= 90) || (input[0] > 94 && input[0] < 126) {
-            Ok((&input[1..], input[0]))
-        } else {
-            Err(nom::Err::Error((input, "dtext(): out of range").into()))
-        }
+        alt((byte_in_range(33, 90), byte_in_range(94, 126)))(input)
     }
 }