Add more charsets and parsing stuff

embed
Manos Pitsidianakis 2018-08-11 22:47:27 +03:00
parent 1b44aae5ce
commit d8137738b5
Signed by: Manos Pitsidianakis
GPG Key ID: 73627C2F690DF710
3 changed files with 93 additions and 95 deletions

View File

@ -28,6 +28,7 @@ use std::error::Error;
use std::fmt; use std::fmt;
use std::io; use std::io;
use std::result; use std::result;
use std::string;
use nom; use nom;
@ -82,6 +83,13 @@ impl<'a> From<Cow<'a, str>> for MeliError {
} }
} }
/// Conversion from a failed UTF-8 `String` construction into a `MeliError`.
impl From<string::FromUtf8Error> for MeliError {
    /// Wraps the `Debug` rendering of the UTF-8 error in a new `MeliError`.
    #[inline]
    fn from(err: string::FromUtf8Error) -> Self {
        let details = format!("{:?}", err);
        MeliError::new(details)
    }
}
//use std::option; //use std::option;
//impl From<option::NoneError> for MeliError { //impl From<option::NoneError> for MeliError {
// #[inline] // #[inline]

View File

@ -1,4 +1,6 @@
use mailbox::email::parser::BytesExt;
use std::fmt::{Display, Formatter, Result as FmtResult}; use std::fmt::{Display, Formatter, Result as FmtResult};
use std::str;
// Character sets named by this module.
// NOTE(review): the hunk header below hides the variants declared before `ISO8859_1`;
// `ISO8859_15`, `Windows1251`, `BIG5` and `ISO2022JP` appear once because only the newer text has them.
#[derive(Clone, Copy, Debug, PartialEq)] #[derive(Clone, Copy, Debug, PartialEq)]
pub enum Charset { pub enum Charset {
@ -8,10 +10,14 @@ pub enum Charset {
ISO8859_1, ISO8859_1,
ISO8859_2, ISO8859_2,
ISO8859_7, ISO8859_7,
ISO8859_15,
Windows1251,
Windows1252, Windows1252,
Windows1253, Windows1253,
GBK, GBK,
GB2312, GB2312,
BIG5,
ISO2022JP,
} }
impl Default for Charset { impl Default for Charset {
@ -23,18 +29,25 @@ impl Default for Charset {
// Maps a charset label given as raw bytes to a `Charset` variant.
// Changed lines below carry the older text followed by the newer text.
impl<'a> From<&'a [u8]> for Charset { impl<'a> From<&'a [u8]> for Charset {
fn from(b: &'a [u8]) -> Self { fn from(b: &'a [u8]) -> Self {
// TODO: Case insensitivity // TODO: Case insensitivity
// The newer form trims the label before matching (`trim` on a byte slice presumably comes from `BytesExt` — confirm).
match b { match b.trim() {
b"us-ascii" | b"ascii" | b"US-ASCII" => Charset::Ascii, b"us-ascii" | b"ascii" | b"US-ASCII" => Charset::Ascii,
b"utf-8" | b"UTF-8" => Charset::UTF8, b"utf-8" | b"UTF-8" => Charset::UTF8,
b"utf-16" | b"UTF-16" => Charset::UTF16, b"utf-16" | b"UTF-16" => Charset::UTF16,
b"iso-8859-1" | b"ISO-8859-1" => Charset::ISO8859_1, b"iso-8859-1" | b"ISO-8859-1" => Charset::ISO8859_1,
b"iso-8859-2" | b"ISO-8859-2" => Charset::ISO8859_2, b"iso-8859-2" | b"ISO-8859-2" => Charset::ISO8859_2,
b"iso-8859-7" | b"ISO-8859-7" => Charset::ISO8859_7, b"iso-8859-7" | b"ISO-8859-7" => Charset::ISO8859_7,
b"iso-8859-15" | b"ISO-8859-15" => Charset::ISO8859_15,
b"windows-1251" | b"Windows-1251" => Charset::Windows1251,
b"windows-1252" | b"Windows-1252" => Charset::Windows1252, b"windows-1252" | b"Windows-1252" => Charset::Windows1252,
b"windows-1253" | b"Windows-1253" => Charset::Windows1253, b"windows-1253" | b"Windows-1253" => Charset::Windows1253,
b"GBK" | b"gbk" => Charset::GBK, b"GBK" | b"gbk" => Charset::GBK,
b"gb2312" | b"GB2312" => Charset::GB2312, b"gb2312" | b"GB2312" => Charset::GB2312,
_ => Charset::Ascii, b"BIG5" | b"big5" => Charset::BIG5,
// NOTE(review): only the exact spellings listed match; e.g. all-lowercase "iso-2022-jp"
// or upper-case "WINDOWS-1252" are not listed and end up in the fallback arm.
b"ISO-2022-JP" | b"iso-2022-JP" => Charset::ISO2022JP,
// Fallback: the newer form prints the untrimmed label to stderr, then treats it as ASCII.
_ => {
eprintln!("unknown tag is {:?}", str::from_utf8(b));
Charset::Ascii
},
} }
} }
} }

View File

@ -23,10 +23,8 @@ use chrono;
use data_encoding::BASE64_MIME; use data_encoding::BASE64_MIME;
use encoding::{DecoderTrap, Encoding}; use encoding::{DecoderTrap, Encoding};
use nom::{is_hex_digit, le_u8}; use nom::{is_hex_digit, le_u8};
use nom::{Compare, CompareResult};
use nom::{ErrorKind, IResult, Needed}; use nom::{ErrorKind, IResult, Needed};
use std; use std;
use std::str::from_utf8;
macro_rules! is_whitespace { macro_rules! is_whitespace {
($var:ident) => { ($var:ident) => {
@ -166,7 +164,7 @@ named!(pub attachment<(std::vec::Vec<(&[u8], &[u8])>, &[u8])>,
/* TODO: make a map of encodings and decoding functions so that they can be reused and easily /* TODO: make a map of encodings and decoding functions so that they can be reused and easily
* extended */ * extended */
use encoding::all::{ISO_8859_1, ISO_8859_2, ISO_8859_7, WINDOWS_1252, WINDOWS_1253, GBK}; use encoding::all::*;
// Parses one encoded word of the form `=?charset?encoding?encoded text?=`.
// On success it yields `IResult::Done(rest, bytes)`: the input after the closing `?=` and the decoded payload.
// Every malformed case visible here is reported as `ErrorKind::Custom(43)`.
// Changed lines below carry the older text (a loop over a fixed list of charset tags) followed by the
// newer text (locate the charset by scanning for `?`, then defer to `Charset::from` / `decode_charset`).
// NOTE(review): the branch taken for inputs shorter than 5 bytes is hidden by the hunk header.
fn encoded_word(input: &[u8]) -> IResult<&[u8], Vec<u8>> { fn encoded_word(input: &[u8]) -> IResult<&[u8], Vec<u8>> {
if input.len() < 5 { if input.len() < 5 {
@ -174,107 +172,86 @@ fn encoded_word(input: &[u8]) -> IResult<&[u8], Vec<u8>> {
} else if input[0] != b'=' || input[1] != b'?' { } else if input[0] != b'=' || input[1] != b'?' {
return IResult::Error(error_code!(ErrorKind::Custom(43))); return IResult::Error(error_code!(ErrorKind::Custom(43)));
} }
for tag in &[ /* find end of Charset tag:
"UTF-8", * =?charset?encoding?encoded text?=
"iso-8859-7", * ---------^
"windows-1253", */
"iso-8859-1", let mut tag_end_idx = None;
"iso-8859-2", for (idx, b) in input[2..].iter().enumerate() {
"gbk", if *b == b'?' {
] { tag_end_idx = Some(idx + 2);
if let CompareResult::Ok = (&input[2..]).compare_no_case(*tag) { break;
let tag_len = tag.len(); }
/* tag must end with ?_? where _ is either Q or B, eg: =?UTF-8?B? */ }
if input[2 + tag_len] != b'?' || input[2 + tag_len + 2] != b'?' { if tag_end_idx.is_none() {
return IResult::Error(error_code!(ErrorKind::Custom(43))); return IResult::Error(error_code!(ErrorKind::Custom(43)));
} }
/* See if input ends with "?=" and get ending index */ let tag_end_idx = tag_end_idx.unwrap();
let mut encoded_idx = None;
for i in (5 + tag_len)..input.len() {
// NOTE(review): in the newer text, `tag_end_idx` is the absolute index of the `?` that ends the charset,
// so `input[2 + tag_end_idx]` is the `?` after the one-byte encoding letter. The index is not
// bounds-checked: a truncated input such as `=?a??` indexes past the end and panics.
if input[i] == b'?' && i < input.len() && input[i + 1] == b'=' { if input[2 + tag_end_idx] != b'?' {
encoded_idx = Some(i); return IResult::Error(error_code!(ErrorKind::Custom(43)));
break; }
} /* See if input ends with "?=" and get ending index
} * =?charset?encoding?encoded text?=
if encoded_idx.is_none() { * -------------------------------^
return IResult::Error(error_code!(ErrorKind::Custom(43))); */
} let mut encoded_end_idx = None;
let encoded = &input[5 + tag_len..encoded_idx.unwrap()]; for i in (3 + tag_end_idx)..input.len() {
// NOTE(review): `i < input.len()` always holds inside this range, so it does not guard `input[i + 1]`;
// an input whose last byte is `?` (e.g. `=?a?b?x?`) indexes one past the end and panics.
if input[i] == b'?' && i < input.len() && input[i + 1] == b'=' {
let s: Vec<u8> = match input[2 + tag_len + 1] { encoded_end_idx = Some(i);
b'b' | b'B' => match BASE64_MIME.decode(encoded) { break;
Ok(v) => v, }
Err(_) => encoded.to_vec(), }
}, if encoded_end_idx.is_none() {
b'q' | b'Q' => match quoted_printable_bytes_header(encoded) { return IResult::Error(error_code!(ErrorKind::Custom(43)));
IResult::Done(b"", s) => s, }
_ => return IResult::Error(error_code!(ErrorKind::Custom(43))), let encoded_end_idx = encoded_end_idx.unwrap();
}, let encoded_text = &input[3 + tag_end_idx..encoded_end_idx];
_ => return IResult::Error(error_code!(ErrorKind::Custom(43))),
// The byte after the charset selects the transfer encoding: `b`/`B` is base64, `q`/`Q` is quoted-printable.
// A base64 payload that fails to decode is passed through as its raw bytes rather than rejected.
}; let s: Vec<u8> = match input[tag_end_idx + 1] {
b'b' | b'B' => match BASE64_MIME.decode(encoded_text) {
match *tag { Ok(v) => v,
"UTF-8" => { Err(_) => encoded_text.to_vec(),
return IResult::Done(&input[encoded_idx.unwrap() + 2..], s); },
} b'q' | b'Q' => match quoted_printable_bytes_header(encoded_text) {
"iso-8859-7" => { IResult::Done(b"", s) => s,
return if let Ok(v) = ISO_8859_7.decode(&s, DecoderTrap::Strict) { _ => return IResult::Error(error_code!(ErrorKind::Custom(43))),
IResult::Done(&input[encoded_idx.unwrap() + 2..], v.into_bytes()) },
} else { _ => return IResult::Error(error_code!(ErrorKind::Custom(43))),
IResult::Error(error_code!(ErrorKind::Custom(43))) };
}
}
"windows-1253" => { let charset = Charset::from(&input[2..tag_end_idx]);
return if let Ok(v) = WINDOWS_1253.decode(&s, DecoderTrap::Strict) {
// In the newer text UTF-8 payloads are returned as decoded; any other charset goes through `decode_charset`.
IResult::Done(&input[encoded_idx.unwrap() + 2..], v.into_bytes()) if let Charset::UTF8 = charset {
} else { IResult::Done(&input[encoded_end_idx + 2..], s)
IResult::Error(error_code!(ErrorKind::Custom(43))) } else {
} match decode_charset(&s, charset) {
} Ok(v) => {
"iso-8859-1" => { IResult::Done(&input[encoded_end_idx + 2..], v.into_bytes())
return if let Ok(v) = ISO_8859_1.decode(&s, DecoderTrap::Strict) { },
IResult::Done(&input[encoded_idx.unwrap() + 2..], v.into_bytes()) _ => IResult::Error(error_code!(ErrorKind::Custom(43))),
} else {
IResult::Error(error_code!(ErrorKind::Custom(43)))
}
}
"iso-8859-2" => {
return if let Ok(v) = ISO_8859_2.decode(&s, DecoderTrap::Strict) {
IResult::Done(&input[encoded_idx.unwrap() + 2..], v.into_bytes())
} else {
IResult::Error(error_code!(ErrorKind::Custom(43)))
}
}
"gbk" => {
return if let Ok(v) = GBK.decode(&s, DecoderTrap::Strict) {
IResult::Done(&input[encoded_idx.unwrap() + 2..], v.into_bytes())
} else {
IResult::Error(error_code!(ErrorKind::Custom(43)))
}
}
_ => {
panic!();
}
}
} else {
continue;
} }
} }
eprintln!("unknown tag is {:?}", from_utf8(&input[2..20]));
IResult::Error(error_code!(ErrorKind::Custom(43)))
} }
// Converts the bytes `s`, interpreted in `charset`, into an owned `String`.
// Changed lines below carry the older text followed by the newer text.
pub fn decode_charset(s: &[u8], charset: Charset) -> Result<String> { pub fn decode_charset(s: &[u8], charset: Charset) -> Result<String> {
match charset { match charset {
// UTF-8 / ASCII: the older form unwrapped `String::from_utf8` and so panicked on invalid bytes;
// the newer form uses `from_utf8_lossy`, which substitutes U+FFFD for invalid sequences.
Charset::UTF8 | Charset::Ascii => Ok(String::from_utf8(s.to_vec()).unwrap()), Charset::UTF8 | Charset::Ascii => Ok(String::from_utf8_lossy(s).to_string()),
// Charsets with a decoder: decoding uses `DecoderTrap::Strict`, and a failure is propagated to the caller via `?`.
Charset::ISO8859_7 => Ok(ISO_8859_7.decode(s, DecoderTrap::Strict)?),
Charset::ISO8859_1 => Ok(ISO_8859_1.decode(s, DecoderTrap::Strict)?), Charset::ISO8859_1 => Ok(ISO_8859_1.decode(s, DecoderTrap::Strict)?),
Charset::ISO8859_2 => Ok(ISO_8859_2.decode(s, DecoderTrap::Strict)?), Charset::ISO8859_2 => Ok(ISO_8859_2.decode(s, DecoderTrap::Strict)?),
Charset::ISO8859_7 => Ok(ISO_8859_7.decode(s, DecoderTrap::Strict)?),
Charset::ISO8859_15 => Ok(ISO_8859_15.decode(s, DecoderTrap::Strict)?),
Charset::GBK => Ok(GBK.decode(s, DecoderTrap::Strict)?), Charset::GBK => Ok(GBK.decode(s, DecoderTrap::Strict)?),
Charset::Windows1251 => Ok(WINDOWS_1251.decode(s, DecoderTrap::Strict)?),
Charset::Windows1252 => Ok(WINDOWS_1252.decode(s, DecoderTrap::Strict)?), Charset::Windows1252 => Ok(WINDOWS_1252.decode(s, DecoderTrap::Strict)?),
Charset::Windows1253 => Ok(WINDOWS_1253.decode(s, DecoderTrap::Strict)?),
// Charsets without a decoder: the older form hit `unimplemented!()`; the newer form reads the bytes as lossy UTF-8.
// NOTE(review): this never fails, but non-UTF-8 text in these charsets will presumably come out
// as replacement characters rather than the intended text — confirm this is the desired fallback.
Charset::GB2312 => unimplemented!(), // Unimplemented:
Charset::UTF16 => unimplemented!(), Charset::GB2312 => Ok(String::from_utf8_lossy(s).to_string()),
Charset::UTF16 => Ok(String::from_utf8_lossy(s).to_string()),
Charset::BIG5 => Ok(String::from_utf8_lossy(s).to_string()),
Charset::ISO2022JP => Ok(String::from_utf8_lossy(s).to_string()),
} }
} }