Add more charsets and parsing stuff

2018-08-11 22:47:27 +03:00 · 2018-08-11 22:47:27 +03:00 · d8137738b5
parent 1b44aae5ce
commit d8137738b5
3 changed files with 93 additions and 95 deletions
--- a/melib/src/error.rs
+++ b/melib/src/error.rs
@ -28,6 +28,7 @@ use std::error::Error;
 use std::fmt;
 use std::io;
 use std::result;
+use std::string;

 use nom;

@ -82,6 +83,13 @@ impl<'a> From<Cow<'a, str>> for MeliError {
    }
 }

+impl From<string::FromUtf8Error> for MeliError {
+    #[inline]
+    fn from(kind: string::FromUtf8Error) -> MeliError {
+        MeliError::new(format!("{:?}", kind))
+    }
+}
+
 //use std::option;
 //impl From<option::NoneError> for MeliError {
 //    #[inline]
--- a/melib/src/mailbox/email/attachment_types.rs
+++ b/melib/src/mailbox/email/attachment_types.rs
@ -1,4 +1,6 @@
+use mailbox::email::parser::BytesExt;
 use std::fmt::{Display, Formatter, Result as FmtResult};
+use std::str;

 #[derive(Clone, Copy, Debug, PartialEq)]
 pub enum Charset {
@ -8,10 +10,14 @@ pub enum Charset {
    ISO8859_1,
    ISO8859_2,
    ISO8859_7,
+    ISO8859_15,
+    Windows1251,
    Windows1252,
    Windows1253,
    GBK,
    GB2312,
+    BIG5,
+    ISO2022JP,
 }

 impl Default for Charset {
@ -23,18 +29,25 @@ impl Default for Charset {
 impl<'a> From<&'a [u8]> for Charset {
    fn from(b: &'a [u8]) -> Self {
        // TODO: Case insensitivity
-        match b {
+        match b.trim() {
            b"us-ascii" | b"ascii" | b"US-ASCII" => Charset::Ascii,
            b"utf-8" | b"UTF-8" => Charset::UTF8,
            b"utf-16" | b"UTF-16" => Charset::UTF16,
            b"iso-8859-1" | b"ISO-8859-1" => Charset::ISO8859_1,
            b"iso-8859-2" | b"ISO-8859-2" => Charset::ISO8859_2,
            b"iso-8859-7" | b"ISO-8859-7" => Charset::ISO8859_7,
+            b"iso-8859-15" | b"ISO-8859-15" => Charset::ISO8859_15,
+            b"windows-1251" | b"Windows-1251" => Charset::Windows1251,
            b"windows-1252" | b"Windows-1252" => Charset::Windows1252,
            b"windows-1253" | b"Windows-1253" => Charset::Windows1253,
            b"GBK" | b"gbk" => Charset::GBK,
            b"gb2312" | b"GB2312" => Charset::GB2312,
-            _ => Charset::Ascii,
+            b"BIG5" | b"big5" => Charset::BIG5,
+            b"ISO-2022-JP" | b"iso-2022-JP" => Charset::ISO2022JP,
+            _ => {
+                eprintln!("unknown tag is {:?}", str::from_utf8(b));
+                Charset::Ascii
+            },
        }
    }
 }
--- a/melib/src/mailbox/email/parser.rs
+++ b/melib/src/mailbox/email/parser.rs
@ -23,10 +23,8 @@ use chrono;
 use data_encoding::BASE64_MIME;
 use encoding::{DecoderTrap, Encoding};
 use nom::{is_hex_digit, le_u8};
-use nom::{Compare, CompareResult};
 use nom::{ErrorKind, IResult, Needed};
 use std;
-use std::str::from_utf8;

 macro_rules! is_whitespace {
    ($var:ident) => {
@ -166,7 +164,7 @@ named!(pub attachment<(std::vec::Vec<(&[u8], &[u8])>, &[u8])>,

 /* TODO: make a map of encodings and decoding functions so that they can be reused and easily
 * extended */
-use encoding::all::{ISO_8859_1, ISO_8859_2, ISO_8859_7, WINDOWS_1252, WINDOWS_1253, GBK};
+use encoding::all::*;

 fn encoded_word(input: &[u8]) -> IResult<&[u8], Vec<u8>> {
    if input.len() < 5 {
@ -174,107 +172,86 @@ fn encoded_word(input: &[u8]) -> IResult<&[u8], Vec<u8>> {
    } else if input[0] != b'=' || input[1] != b'?' {
        return IResult::Error(error_code!(ErrorKind::Custom(43)));
    }
-    for tag in &[
-        "UTF-8",
-        "iso-8859-7",
-        "windows-1253",
-        "iso-8859-1",
-        "iso-8859-2",
-        "gbk",
-    ] {
-        if let CompareResult::Ok = (&input[2..]).compare_no_case(*tag) {
-            let tag_len = tag.len();
-            /* tag must end with ?_? where _ is either Q or B, eg: =?UTF-8?B? */
-            if input[2 + tag_len] != b'?' || input[2 + tag_len + 2] != b'?' {
-                return IResult::Error(error_code!(ErrorKind::Custom(43)));
-            }
-            /* See if input ends with "?=" and get ending index */
-            let mut encoded_idx = None;
-            for i in (5 + tag_len)..input.len() {
-                if input[i] == b'?' && i < input.len() && input[i + 1] == b'=' {
-                    encoded_idx = Some(i);
-                    break;
-                }
-            }
-            if encoded_idx.is_none() {
-                return IResult::Error(error_code!(ErrorKind::Custom(43)));
-            }
-            let encoded = &input[5 + tag_len..encoded_idx.unwrap()];
-
-            let s: Vec<u8> = match input[2 + tag_len + 1] {
-                b'b' | b'B' => match BASE64_MIME.decode(encoded) {
-                    Ok(v) => v,
-                    Err(_) => encoded.to_vec(),
-                },
-                b'q' | b'Q' => match quoted_printable_bytes_header(encoded) {
-                    IResult::Done(b"", s) => s,
-                    _ => return IResult::Error(error_code!(ErrorKind::Custom(43))),
-                },
-                _ => return IResult::Error(error_code!(ErrorKind::Custom(43))),
-            };
-
-            match *tag {
-                "UTF-8" => {
-                    return IResult::Done(&input[encoded_idx.unwrap() + 2..], s);
-                }
-                "iso-8859-7" => {
-                    return if let Ok(v) = ISO_8859_7.decode(&s, DecoderTrap::Strict) {
-                        IResult::Done(&input[encoded_idx.unwrap() + 2..], v.into_bytes())
-                    } else {
-                        IResult::Error(error_code!(ErrorKind::Custom(43)))
-                    }
-                }
-                "windows-1253" => {
-                    return if let Ok(v) = WINDOWS_1253.decode(&s, DecoderTrap::Strict) {
-                        IResult::Done(&input[encoded_idx.unwrap() + 2..], v.into_bytes())
-                    } else {
-                        IResult::Error(error_code!(ErrorKind::Custom(43)))
-                    }
-                }
-                "iso-8859-1" => {
-                    return if let Ok(v) = ISO_8859_1.decode(&s, DecoderTrap::Strict) {
-                        IResult::Done(&input[encoded_idx.unwrap() + 2..], v.into_bytes())
-                    } else {
-                        IResult::Error(error_code!(ErrorKind::Custom(43)))
-                    }
-                }
-                "iso-8859-2" => {
-                    return if let Ok(v) = ISO_8859_2.decode(&s, DecoderTrap::Strict) {
-                        IResult::Done(&input[encoded_idx.unwrap() + 2..], v.into_bytes())
-                    } else {
-                        IResult::Error(error_code!(ErrorKind::Custom(43)))
-                    }
-                }
-                "gbk" => {
-                    return if let Ok(v) = GBK.decode(&s, DecoderTrap::Strict) {
-                        IResult::Done(&input[encoded_idx.unwrap() + 2..], v.into_bytes())
-                    } else {
-                        IResult::Error(error_code!(ErrorKind::Custom(43)))
-                    }
-                }
-                _ => {
-                    panic!();
-                }
-            }
-        } else {
-            continue;
+    /* find end of Charset tag:
+     * =?charset?encoding?encoded text?=
+     * ---------^
+     */
+    let mut tag_end_idx = None;
+    for (idx, b) in input[2..].iter().enumerate() {
+        if *b == b'?' {
+            tag_end_idx = Some(idx + 2);
+            break;
+        }
+    }
+    if tag_end_idx.is_none() {
+        return IResult::Error(error_code!(ErrorKind::Custom(43)));
+    }
+    let tag_end_idx = tag_end_idx.unwrap();
+
+
+    if input[2 + tag_end_idx] != b'?' {
+        return IResult::Error(error_code!(ErrorKind::Custom(43)));
+    }
+    /* See if input ends with "?=" and get ending index
+     * =?charset?encoding?encoded text?=
+     * -------------------------------^
+     */
+    let mut encoded_end_idx = None;
+    for i in (3 + tag_end_idx)..input.len() {
+        if input[i] == b'?' && i < input.len() && input[i + 1] == b'=' {
+            encoded_end_idx = Some(i);
+            break;
+        }
+    }
+    if encoded_end_idx.is_none() {
+        return IResult::Error(error_code!(ErrorKind::Custom(43)));
+    }
+    let encoded_end_idx = encoded_end_idx.unwrap();
+    let encoded_text = &input[3 + tag_end_idx..encoded_end_idx];
+
+    let s: Vec<u8> = match input[tag_end_idx + 1] {
+        b'b' | b'B' => match BASE64_MIME.decode(encoded_text) {
+            Ok(v) => v,
+            Err(_) => encoded_text.to_vec(),
+        },
+        b'q' | b'Q' => match quoted_printable_bytes_header(encoded_text) {
+            IResult::Done(b"", s) => s,
+            _ => return IResult::Error(error_code!(ErrorKind::Custom(43))),
+        },
+        _ => return IResult::Error(error_code!(ErrorKind::Custom(43))),
+    };
+
+
+    let charset = Charset::from(&input[2..tag_end_idx]);
+
+    if let Charset::UTF8 = charset {
+        IResult::Done(&input[encoded_end_idx + 2..], s)
+    } else {
+        match decode_charset(&s, charset) {
+            Ok(v) => {
+                IResult::Done(&input[encoded_end_idx + 2..], v.into_bytes())
+            },
+            _ =>   IResult::Error(error_code!(ErrorKind::Custom(43))),
        }
    }
-    eprintln!("unknown tag is {:?}", from_utf8(&input[2..20]));
-    IResult::Error(error_code!(ErrorKind::Custom(43)))
 }

 pub fn decode_charset(s: &[u8], charset: Charset) -> Result<String> {
    match charset {
-        Charset::UTF8 | Charset::Ascii => Ok(String::from_utf8(s.to_vec()).unwrap()),
-        Charset::ISO8859_7 => Ok(ISO_8859_7.decode(s, DecoderTrap::Strict)?),
+        Charset::UTF8 | Charset::Ascii => Ok(String::from_utf8_lossy(s).to_string()),
        Charset::ISO8859_1 => Ok(ISO_8859_1.decode(s, DecoderTrap::Strict)?),
        Charset::ISO8859_2 => Ok(ISO_8859_2.decode(s, DecoderTrap::Strict)?),
+        Charset::ISO8859_7 => Ok(ISO_8859_7.decode(s, DecoderTrap::Strict)?),
+        Charset::ISO8859_15 => Ok(ISO_8859_15.decode(s, DecoderTrap::Strict)?),
        Charset::GBK => Ok(GBK.decode(s, DecoderTrap::Strict)?),
+        Charset::Windows1251 => Ok(WINDOWS_1251.decode(s, DecoderTrap::Strict)?),
        Charset::Windows1252 => Ok(WINDOWS_1252.decode(s, DecoderTrap::Strict)?),
        Charset::Windows1253 => Ok(WINDOWS_1253.decode(s, DecoderTrap::Strict)?),
-        Charset::GB2312 => unimplemented!(),
-        Charset::UTF16 => unimplemented!(),
+        // Unimplemented:
+        Charset::GB2312 => Ok(String::from_utf8_lossy(s).to_string()),
+        Charset::UTF16 => Ok(String::from_utf8_lossy(s).to_string()),
+        Charset::BIG5 => Ok(String::from_utf8_lossy(s).to_string()),
+        Charset::ISO2022JP => Ok(String::from_utf8_lossy(s).to_string()),
    }
 }