From d8137738b550c78d87592c784c5bcedb54de61aa Mon Sep 17 00:00:00 2001 From: Manos Pitsidianakis Date: Sat, 11 Aug 2018 22:47:27 +0300 Subject: [PATCH] Add more charsets and parsing stuff --- melib/src/error.rs | 8 + melib/src/mailbox/email/attachment_types.rs | 17 +- melib/src/mailbox/email/parser.rs | 163 +++++++++----------- 3 files changed, 93 insertions(+), 95 deletions(-) diff --git a/melib/src/error.rs b/melib/src/error.rs index 1bdd9021..432325ce 100644 --- a/melib/src/error.rs +++ b/melib/src/error.rs @@ -28,6 +28,7 @@ use std::error::Error; use std::fmt; use std::io; use std::result; +use std::string; use nom; @@ -82,6 +83,13 @@ impl<'a> From> for MeliError { } } +impl From for MeliError { + #[inline] + fn from(kind: string::FromUtf8Error) -> MeliError { + MeliError::new(format!("{:?}", kind)) + } +} + //use std::option; //impl From for MeliError { // #[inline] diff --git a/melib/src/mailbox/email/attachment_types.rs b/melib/src/mailbox/email/attachment_types.rs index 0e73550b..8712e7d2 100644 --- a/melib/src/mailbox/email/attachment_types.rs +++ b/melib/src/mailbox/email/attachment_types.rs @@ -1,4 +1,6 @@ +use mailbox::email::parser::BytesExt; use std::fmt::{Display, Formatter, Result as FmtResult}; +use std::str; #[derive(Clone, Copy, Debug, PartialEq)] pub enum Charset { @@ -8,10 +10,14 @@ pub enum Charset { ISO8859_1, ISO8859_2, ISO8859_7, + ISO8859_15, + Windows1251, Windows1252, Windows1253, GBK, GB2312, + BIG5, + ISO2022JP, } impl Default for Charset { @@ -23,18 +29,25 @@ impl Default for Charset { impl<'a> From<&'a [u8]> for Charset { fn from(b: &'a [u8]) -> Self { // TODO: Case insensitivity - match b { + match b.trim() { b"us-ascii" | b"ascii" | b"US-ASCII" => Charset::Ascii, b"utf-8" | b"UTF-8" => Charset::UTF8, b"utf-16" | b"UTF-16" => Charset::UTF16, b"iso-8859-1" | b"ISO-8859-1" => Charset::ISO8859_1, b"iso-8859-2" | b"ISO-8859-2" => Charset::ISO8859_2, b"iso-8859-7" | b"ISO-8859-7" => Charset::ISO8859_7, + b"iso-8859-15" | b"ISO-8859-15" => Charset::ISO8859_15, + b"windows-1251" | b"Windows-1251" => Charset::Windows1251, b"windows-1252" | b"Windows-1252" => Charset::Windows1252, b"windows-1253" | b"Windows-1253" => Charset::Windows1253, b"GBK" | b"gbk" => Charset::GBK, b"gb2312" | b"GB2312" => Charset::GB2312, - _ => Charset::Ascii, + b"BIG5" | b"big5" => Charset::BIG5, + b"ISO-2022-JP" | b"iso-2022-JP" => Charset::ISO2022JP, + _ => { + eprintln!("unknown tag is {:?}", str::from_utf8(b)); + Charset::Ascii + }, } } } diff --git a/melib/src/mailbox/email/parser.rs b/melib/src/mailbox/email/parser.rs index cd0e9c12..cd9c2ba7 100644 --- a/melib/src/mailbox/email/parser.rs +++ b/melib/src/mailbox/email/parser.rs @@ -23,10 +23,8 @@ use chrono; use data_encoding::BASE64_MIME; use encoding::{DecoderTrap, Encoding}; use nom::{is_hex_digit, le_u8}; -use nom::{Compare, CompareResult}; use nom::{ErrorKind, IResult, Needed}; use std; -use std::str::from_utf8; macro_rules! is_whitespace { ($var:ident) => { @@ -166,7 +164,7 @@ named!(pub attachment<(std::vec::Vec<(&[u8], &[u8])>, &[u8])>, /* TODO: make a map of encodings and decoding functions so that they can be reused and easily * extended */ -use encoding::all::{ISO_8859_1, ISO_8859_2, ISO_8859_7, WINDOWS_1252, WINDOWS_1253, GBK}; +use encoding::all::*; fn encoded_word(input: &[u8]) -> IResult<&[u8], Vec> { if input.len() < 5 { @@ -174,107 +172,86 @@ fn encoded_word(input: &[u8]) -> IResult<&[u8], Vec> { } else if input[0] != b'=' || input[1] != b'?' { return IResult::Error(error_code!(ErrorKind::Custom(43))); } - for tag in &[ - "UTF-8", - "iso-8859-7", - "windows-1253", - "iso-8859-1", - "iso-8859-2", - "gbk", - ] { - if let CompareResult::Ok = (&input[2..]).compare_no_case(*tag) { - let tag_len = tag.len(); - /* tag must end with ?_? where _ is either Q or B, eg: =?UTF-8?B? */ - if input[2 + tag_len] != b'?' || input[2 + tag_len + 2] != b'?' { - return IResult::Error(error_code!(ErrorKind::Custom(43))); - } - /* See if input ends with "?=" and get ending index */ - let mut encoded_idx = None; - for i in (5 + tag_len)..input.len() { - if input[i] == b'?' && i < input.len() && input[i + 1] == b'=' { - encoded_idx = Some(i); - break; - } - } - if encoded_idx.is_none() { - return IResult::Error(error_code!(ErrorKind::Custom(43))); - } - let encoded = &input[5 + tag_len..encoded_idx.unwrap()]; - - let s: Vec = match input[2 + tag_len + 1] { - b'b' | b'B' => match BASE64_MIME.decode(encoded) { - Ok(v) => v, - Err(_) => encoded.to_vec(), - }, - b'q' | b'Q' => match quoted_printable_bytes_header(encoded) { - IResult::Done(b"", s) => s, - _ => return IResult::Error(error_code!(ErrorKind::Custom(43))), - }, - _ => return IResult::Error(error_code!(ErrorKind::Custom(43))), - }; - - match *tag { - "UTF-8" => { - return IResult::Done(&input[encoded_idx.unwrap() + 2..], s); - } - "iso-8859-7" => { - return if let Ok(v) = ISO_8859_7.decode(&s, DecoderTrap::Strict) { - IResult::Done(&input[encoded_idx.unwrap() + 2..], v.into_bytes()) - } else { - IResult::Error(error_code!(ErrorKind::Custom(43))) - } - } - "windows-1253" => { - return if let Ok(v) = WINDOWS_1253.decode(&s, DecoderTrap::Strict) { - IResult::Done(&input[encoded_idx.unwrap() + 2..], v.into_bytes()) - } else { - IResult::Error(error_code!(ErrorKind::Custom(43))) - } - } - "iso-8859-1" => { - return if let Ok(v) = ISO_8859_1.decode(&s, DecoderTrap::Strict) { - IResult::Done(&input[encoded_idx.unwrap() + 2..], v.into_bytes()) - } else { - IResult::Error(error_code!(ErrorKind::Custom(43))) - } - } - "iso-8859-2" => { - return if let Ok(v) = ISO_8859_2.decode(&s, DecoderTrap::Strict) { - IResult::Done(&input[encoded_idx.unwrap() + 2..], v.into_bytes()) - } else { - IResult::Error(error_code!(ErrorKind::Custom(43))) - } - } - "gbk" => { - return if let Ok(v) = GBK.decode(&s, DecoderTrap::Strict) { - IResult::Done(&input[encoded_idx.unwrap() + 2..], v.into_bytes()) - } else { - IResult::Error(error_code!(ErrorKind::Custom(43))) - } - } - _ => { - panic!(); - } - } - } else { - continue; + /* find end of Charset tag: + * =?charset?encoding?encoded text?= + * ---------^ + */ + let mut tag_end_idx = None; + for (idx, b) in input[2..].iter().enumerate() { + if *b == b'?' { + tag_end_idx = Some(idx + 2); + break; + } + } + if tag_end_idx.is_none() { + return IResult::Error(error_code!(ErrorKind::Custom(43))); + } + let tag_end_idx = tag_end_idx.unwrap(); + + + if input[2 + tag_end_idx] != b'?' { + return IResult::Error(error_code!(ErrorKind::Custom(43))); + } + /* See if input ends with "?=" and get ending index + * =?charset?encoding?encoded text?= + * -------------------------------^ + */ + let mut encoded_end_idx = None; + for i in (3 + tag_end_idx)..input.len() { + if input[i] == b'?' && i < input.len() && input[i + 1] == b'=' { + encoded_end_idx = Some(i); + break; + } + } + if encoded_end_idx.is_none() { + return IResult::Error(error_code!(ErrorKind::Custom(43))); + } + let encoded_end_idx = encoded_end_idx.unwrap(); + let encoded_text = &input[3 + tag_end_idx..encoded_end_idx]; + + let s: Vec = match input[tag_end_idx + 1] { + b'b' | b'B' => match BASE64_MIME.decode(encoded_text) { + Ok(v) => v, + Err(_) => encoded_text.to_vec(), + }, + b'q' | b'Q' => match quoted_printable_bytes_header(encoded_text) { + IResult::Done(b"", s) => s, + _ => return IResult::Error(error_code!(ErrorKind::Custom(43))), + }, + _ => return IResult::Error(error_code!(ErrorKind::Custom(43))), + }; + + + let charset = Charset::from(&input[2..tag_end_idx]); + + if let Charset::UTF8 = charset { + IResult::Done(&input[encoded_end_idx + 2..], s) + } else { + match decode_charset(&s, charset) { + Ok(v) => { + IResult::Done(&input[encoded_end_idx + 2..], v.into_bytes()) + }, + _ => IResult::Error(error_code!(ErrorKind::Custom(43))), } } - eprintln!("unknown tag is {:?}", from_utf8(&input[2..20])); - IResult::Error(error_code!(ErrorKind::Custom(43))) } pub fn decode_charset(s: &[u8], charset: Charset) -> Result { match charset { - Charset::UTF8 | Charset::Ascii => Ok(String::from_utf8(s.to_vec()).unwrap()), - Charset::ISO8859_7 => Ok(ISO_8859_7.decode(s, DecoderTrap::Strict)?), + Charset::UTF8 | Charset::Ascii => Ok(String::from_utf8_lossy(s).to_string()), Charset::ISO8859_1 => Ok(ISO_8859_1.decode(s, DecoderTrap::Strict)?), Charset::ISO8859_2 => Ok(ISO_8859_2.decode(s, DecoderTrap::Strict)?), + Charset::ISO8859_7 => Ok(ISO_8859_7.decode(s, DecoderTrap::Strict)?), + Charset::ISO8859_15 => Ok(ISO_8859_15.decode(s, DecoderTrap::Strict)?), Charset::GBK => Ok(GBK.decode(s, DecoderTrap::Strict)?), + Charset::Windows1251 => Ok(WINDOWS_1251.decode(s, DecoderTrap::Strict)?), Charset::Windows1252 => Ok(WINDOWS_1252.decode(s, DecoderTrap::Strict)?), Charset::Windows1253 => Ok(WINDOWS_1253.decode(s, DecoderTrap::Strict)?), - Charset::GB2312 => unimplemented!(), - Charset::UTF16 => unimplemented!(), + // Unimplemented: + Charset::GB2312 => Ok(String::from_utf8_lossy(s).to_string()), + Charset::UTF16 => Ok(String::from_utf8_lossy(s).to_string()), + Charset::BIG5 => Ok(String::from_utf8_lossy(s).to_string()), + Charset::ISO2022JP => Ok(String::from_utf8_lossy(s).to_string()), } }