Add more charsets and parsing stuff

embed
Manos Pitsidianakis 2018-08-11 22:47:27 +03:00
parent 1b44aae5ce
commit d8137738b5
Signed by: Manos Pitsidianakis
GPG Key ID: 73627C2F690DF710
3 changed files with 93 additions and 95 deletions

View File

@ -28,6 +28,7 @@ use std::error::Error;
use std::fmt;
use std::io;
use std::result;
use std::string;
use nom;
@ -82,6 +83,13 @@ impl<'a> From<Cow<'a, str>> for MeliError {
}
}
/// Converts a failed `String::from_utf8` conversion into a `MeliError`.
///
/// The error message is the `Display` text of the `FromUtf8Error`. Its derived `Debug`
/// output is deliberately avoided: it includes the whole rejected byte buffer, which
/// makes the message as large as the input that failed to decode.
impl From<string::FromUtf8Error> for MeliError {
    #[inline]
    fn from(kind: string::FromUtf8Error) -> MeliError {
        MeliError::new(kind.to_string())
    }
}
//use std::option;
//impl From<option::NoneError> for MeliError {
// #[inline]

View File

@ -1,4 +1,6 @@
use mailbox::email::parser::BytesExt;
use std::fmt::{Display, Formatter, Result as FmtResult};
use std::str;
#[derive(Clone, Copy, Debug, PartialEq)]
pub enum Charset {
@ -8,10 +10,14 @@ pub enum Charset {
ISO8859_1,
ISO8859_2,
ISO8859_7,
ISO8859_15,
Windows1251,
Windows1252,
Windows1253,
GBK,
GB2312,
BIG5,
ISO2022JP,
}
impl Default for Charset {
@ -23,18 +29,25 @@ impl Default for Charset {
impl<'a> From<&'a [u8]> for Charset {
fn from(b: &'a [u8]) -> Self {
// TODO: Case insensitivity
match b {
match b.trim() {
b"us-ascii" | b"ascii" | b"US-ASCII" => Charset::Ascii,
b"utf-8" | b"UTF-8" => Charset::UTF8,
b"utf-16" | b"UTF-16" => Charset::UTF16,
b"iso-8859-1" | b"ISO-8859-1" => Charset::ISO8859_1,
b"iso-8859-2" | b"ISO-8859-2" => Charset::ISO8859_2,
b"iso-8859-7" | b"ISO-8859-7" => Charset::ISO8859_7,
b"iso-8859-15" | b"ISO-8859-15" => Charset::ISO8859_15,
b"windows-1251" | b"Windows-1251" => Charset::Windows1251,
b"windows-1252" | b"Windows-1252" => Charset::Windows1252,
b"windows-1253" | b"Windows-1253" => Charset::Windows1253,
b"GBK" | b"gbk" => Charset::GBK,
b"gb2312" | b"GB2312" => Charset::GB2312,
_ => Charset::Ascii,
b"BIG5" | b"big5" => Charset::BIG5,
b"ISO-2022-JP" | b"iso-2022-JP" => Charset::ISO2022JP,
_ => {
eprintln!("unknown tag is {:?}", str::from_utf8(b));
Charset::Ascii
},
}
}
}

View File

@ -23,10 +23,8 @@ use chrono;
use data_encoding::BASE64_MIME;
use encoding::{DecoderTrap, Encoding};
use nom::{is_hex_digit, le_u8};
use nom::{Compare, CompareResult};
use nom::{ErrorKind, IResult, Needed};
use std;
use std::str::from_utf8;
macro_rules! is_whitespace {
($var:ident) => {
@ -166,7 +164,7 @@ named!(pub attachment<(std::vec::Vec<(&[u8], &[u8])>, &[u8])>,
/* TODO: make a map of encodings and decoding functions so that they can be reused and easily
* extended */
use encoding::all::{ISO_8859_1, ISO_8859_2, ISO_8859_7, WINDOWS_1252, WINDOWS_1253, GBK};
use encoding::all::*;
fn encoded_word(input: &[u8]) -> IResult<&[u8], Vec<u8>> {
if input.len() < 5 {
@ -174,107 +172,86 @@ fn encoded_word(input: &[u8]) -> IResult<&[u8], Vec<u8>> {
} else if input[0] != b'=' || input[1] != b'?' {
return IResult::Error(error_code!(ErrorKind::Custom(43)));
}
for tag in &[
"UTF-8",
"iso-8859-7",
"windows-1253",
"iso-8859-1",
"iso-8859-2",
"gbk",
] {
if let CompareResult::Ok = (&input[2..]).compare_no_case(*tag) {
let tag_len = tag.len();
/* tag must end with ?_? where _ is either Q or B, eg: =?UTF-8?B? */
if input[2 + tag_len] != b'?' || input[2 + tag_len + 2] != b'?' {
return IResult::Error(error_code!(ErrorKind::Custom(43)));
}
/* See if input ends with "?=" and get ending index */
let mut encoded_idx = None;
for i in (5 + tag_len)..input.len() {
if input[i] == b'?' && i < input.len() && input[i + 1] == b'=' {
encoded_idx = Some(i);
break;
}
}
if encoded_idx.is_none() {
return IResult::Error(error_code!(ErrorKind::Custom(43)));
}
let encoded = &input[5 + tag_len..encoded_idx.unwrap()];
let s: Vec<u8> = match input[2 + tag_len + 1] {
b'b' | b'B' => match BASE64_MIME.decode(encoded) {
Ok(v) => v,
Err(_) => encoded.to_vec(),
},
b'q' | b'Q' => match quoted_printable_bytes_header(encoded) {
IResult::Done(b"", s) => s,
_ => return IResult::Error(error_code!(ErrorKind::Custom(43))),
},
_ => return IResult::Error(error_code!(ErrorKind::Custom(43))),
};
match *tag {
"UTF-8" => {
return IResult::Done(&input[encoded_idx.unwrap() + 2..], s);
}
"iso-8859-7" => {
return if let Ok(v) = ISO_8859_7.decode(&s, DecoderTrap::Strict) {
IResult::Done(&input[encoded_idx.unwrap() + 2..], v.into_bytes())
} else {
IResult::Error(error_code!(ErrorKind::Custom(43)))
}
}
"windows-1253" => {
return if let Ok(v) = WINDOWS_1253.decode(&s, DecoderTrap::Strict) {
IResult::Done(&input[encoded_idx.unwrap() + 2..], v.into_bytes())
} else {
IResult::Error(error_code!(ErrorKind::Custom(43)))
}
}
"iso-8859-1" => {
return if let Ok(v) = ISO_8859_1.decode(&s, DecoderTrap::Strict) {
IResult::Done(&input[encoded_idx.unwrap() + 2..], v.into_bytes())
} else {
IResult::Error(error_code!(ErrorKind::Custom(43)))
}
}
"iso-8859-2" => {
return if let Ok(v) = ISO_8859_2.decode(&s, DecoderTrap::Strict) {
IResult::Done(&input[encoded_idx.unwrap() + 2..], v.into_bytes())
} else {
IResult::Error(error_code!(ErrorKind::Custom(43)))
}
}
"gbk" => {
return if let Ok(v) = GBK.decode(&s, DecoderTrap::Strict) {
IResult::Done(&input[encoded_idx.unwrap() + 2..], v.into_bytes())
} else {
IResult::Error(error_code!(ErrorKind::Custom(43)))
}
}
_ => {
panic!();
}
}
} else {
continue;
/* find end of Charset tag:
* =?charset?encoding?encoded text?=
* ---------^
*/
let mut tag_end_idx = None;
for (idx, b) in input[2..].iter().enumerate() {
if *b == b'?' {
tag_end_idx = Some(idx + 2);
break;
}
}
if tag_end_idx.is_none() {
return IResult::Error(error_code!(ErrorKind::Custom(43)));
}
let tag_end_idx = tag_end_idx.unwrap();
if input[2 + tag_end_idx] != b'?' {
return IResult::Error(error_code!(ErrorKind::Custom(43)));
}
/* See if input ends with "?=" and get ending index
* =?charset?encoding?encoded text?=
* -------------------------------^
*/
let mut encoded_end_idx = None;
for i in (3 + tag_end_idx)..input.len() {
if input[i] == b'?' && i < input.len() && input[i + 1] == b'=' {
encoded_end_idx = Some(i);
break;
}
}
if encoded_end_idx.is_none() {
return IResult::Error(error_code!(ErrorKind::Custom(43)));
}
let encoded_end_idx = encoded_end_idx.unwrap();
let encoded_text = &input[3 + tag_end_idx..encoded_end_idx];
let s: Vec<u8> = match input[tag_end_idx + 1] {
b'b' | b'B' => match BASE64_MIME.decode(encoded_text) {
Ok(v) => v,
Err(_) => encoded_text.to_vec(),
},
b'q' | b'Q' => match quoted_printable_bytes_header(encoded_text) {
IResult::Done(b"", s) => s,
_ => return IResult::Error(error_code!(ErrorKind::Custom(43))),
},
_ => return IResult::Error(error_code!(ErrorKind::Custom(43))),
};
let charset = Charset::from(&input[2..tag_end_idx]);
if let Charset::UTF8 = charset {
IResult::Done(&input[encoded_end_idx + 2..], s)
} else {
match decode_charset(&s, charset) {
Ok(v) => {
IResult::Done(&input[encoded_end_idx + 2..], v.into_bytes())
},
_ => IResult::Error(error_code!(ErrorKind::Custom(43))),
}
}
eprintln!("unknown tag is {:?}", from_utf8(&input[2..20]));
IResult::Error(error_code!(ErrorKind::Custom(43)))
}
/// Decodes the bytes `s`, interpreted in `charset`, into an owned `String`.
///
/// * `UTF8` / `Ascii` input is converted lossily: invalid sequences become U+FFFD
///   instead of causing a panic.
/// * Charsets with a decoder are decoded with `DecoderTrap::Strict`; a decoder
///   failure is propagated to the caller through `?`.
/// * `UTF16`, `BIG5` and `ISO2022JP` have no decoder wired up here yet and fall back
///   to the same lossy UTF-8 conversion, so the result for those is best-effort only.
///
/// This function never panics on malformed input.
pub fn decode_charset(s: &[u8], charset: Charset) -> Result<String> {
    match charset {
        Charset::UTF8 | Charset::Ascii => Ok(String::from_utf8_lossy(s).into_owned()),
        Charset::ISO8859_1 => Ok(ISO_8859_1.decode(s, DecoderTrap::Strict)?),
        Charset::ISO8859_2 => Ok(ISO_8859_2.decode(s, DecoderTrap::Strict)?),
        Charset::ISO8859_7 => Ok(ISO_8859_7.decode(s, DecoderTrap::Strict)?),
        Charset::ISO8859_15 => Ok(ISO_8859_15.decode(s, DecoderTrap::Strict)?),
        // GB2312 is a subset of GBK, so the GBK decoder covers both labels.
        Charset::GBK | Charset::GB2312 => Ok(GBK.decode(s, DecoderTrap::Strict)?),
        Charset::Windows1251 => Ok(WINDOWS_1251.decode(s, DecoderTrap::Strict)?),
        Charset::Windows1252 => Ok(WINDOWS_1252.decode(s, DecoderTrap::Strict)?),
        Charset::Windows1253 => Ok(WINDOWS_1253.decode(s, DecoderTrap::Strict)?),
        // Unimplemented: degrade to lossy UTF-8 rather than aborting on such input.
        Charset::UTF16 | Charset::BIG5 | Charset::ISO2022JP => {
            Ok(String::from_utf8_lossy(s).into_owned())
        }
    }
}