Make unicode-segmentation a hard dependency
Run cargo lints / Lint on ${{ matrix.build }} (linux-amd64, ubuntu-latest, stable, x86_64-unknown-linux-gnu) (pull_request) Successful in 7m48s Details
Run Tests / Test on ${{ matrix.build }} (linux-amd64, ubuntu-latest, stable, x86_64-unknown-linux-gnu) (pull_request) Successful in 15m42s Details
Cargo manifest lints / Lint Cargo manifests on ${{ matrix.build }} (linux-amd64, ubuntu-latest, stable, x86_64-unknown-linux-gnu) (pull_request) Successful in 11m5s Details

meli/melib are UTF-8 software, so we should have proper Unicode support.

A compile-time env var, `UNICODE_REGENERATE_TABLES`, is added to force
network access and rebuild the cached Unicode tables.

Signed-off-by: Manos Pitsidianakis <manos@pitsidianak.is>
pull/377/head
Manos Pitsidianakis 2024-04-11 21:15:47 +03:00
parent 07072e2e3f
commit ae96038fbf
Signed by: Manos Pitsidianakis
GPG Key ID: 7729C7707F7E09D0
13 changed files with 16 additions and 147 deletions

View File

@ -3,7 +3,7 @@
For a quick start, build and install locally:
```sh
PREFIX=~/.local make install
PREFIX=~/.local make install
```
Available subcommands for `make` are listed with `make help`.
@ -34,6 +34,9 @@ Some functionality is held behind "feature gates", or compile-time flags. The fo
Since its actual use in the code is very limited, it is not recommended to use this (off by default).
- `static` and `*-static` bundle C libraries in dependencies so that you don't need them installed in your system (on by default).
Though not a feature, the presence of the environment variable `UNICODE_REGENERATE_TABLES` at compile time of the `melib` crate will force the regeneration of the Unicode tables.
Otherwise the tables are included with the source code, and there's no real reason to regenerate them unless you intend to modify the code or update to a new Unicode version.
## Build Debian package (*deb*)
Building with Debian's packaged cargo might require the installation of these two packages: `librust-openssl-sys-dev librust-libdbus-sys-dev`

View File

@ -14,10 +14,7 @@ path = "fuzz_targets/envelope_parse.rs"
[dependencies]
libfuzzer-sys = "0.3"
[dependencies.melib]
path = "../melib"
features = ["unicode-algorithms"]
melib = { path = "../melib" }
# Prevent this from interfering with workspaces
[workspace]

View File

@ -31,7 +31,7 @@ indexmap = { version = "^1.6", features = ["serde-1"] }
libc = { version = "0.2.125", default-features = false, features = ["extra_traits"] }
libz-sys = { version = "1.1", features = ["static"], optional = true }
linkify = { version = "^0.8", default-features = false }
melib = { path = "../melib", version = "0.8.5-rc.3", features = ["unicode-algorithms"] }
melib = { path = "../melib", version = "0.8.5-rc.3", features = [] }
nix = { version = "0.27", default-features = false, features = ["signal", "poll", "term", "ioctl", "process"] }
notify = { version = "4.0.1", default-features = false } # >:c
num_cpus = "1.12.0"

View File

@ -50,7 +50,7 @@ serde_path_to_error = { version = "0.1" }
smallvec = { version = "^1.5.0", features = ["serde"] }
smol = "1.0.0"
socket2 = { version = "0.5", features = [] }
unicode-segmentation = { version = "1.2.1", default-features = false, optional = true }
unicode-segmentation = { version = "1.2.1", default-features = false }
url = { version = "2.4", optional = true }
uuid = { version = "^1", features = ["serde", "v4", "v5"] }
xdg = "2.1.0"
@ -77,9 +77,6 @@ sqlite3 = ["rusqlite"]
sqlite3-static = ["sqlite3", "rusqlite/bundled-full"]
tls = ["native-tls"]
tls-static = ["tls", "native-tls/vendored"]
text-processing = []
unicode-algorithms = ["text-processing", "unicode-segmentation"]
unicode-algorithms-cached = ["text-processing", "unicode-segmentation"]
vcard = []
[build-dependencies]

View File

@ -22,24 +22,6 @@ Library for handling mail.
|------------------------------|-------------------------------------|--------------------------|
| `sqlite` | `rusqlite` | Used in IMAP cache. |
|------------------------------|-------------------------------------|--------------------------|
| `unicode-algorithms` | `unicode-segmentation` | Linebreaking algo etc |
| | | For a fresh clean build, |
| | | Network access is |
| | | required to fetch data |
| | | from Unicode's website. |
|------------------------------|-------------------------------------|--------------------------|
| `unicode-algorithms-cached` | `unicode-segmentation` | Linebreaking algo etc |
| | | but it uses a cached |
| | | version of Unicode data |
| | | which might be stale. |
| | | |
| | | Use this feature instead |
| | | of the previous one for |
| | | building without network |
| | | access. |
|------------------------------|-------------------------------------|--------------------------|
| `unicode-algorithms` | `unicode-segmentation` | |
|------------------------------|-------------------------------------|--------------------------|
| `vcard` | | vcard parsing |
|------------------------------|-------------------------------------|--------------------------|
| `gpgme` | | GPG use with libgpgme |

View File

@ -21,15 +21,14 @@
#![allow(clippy::needless_range_loop)]
#[cfg(any(feature = "unicode-algorithms", feature = "unicode-algorithms-cached"))]
include!("src/text/types.rs");
fn main() -> Result<(), std::io::Error> {
#[cfg(any(feature = "unicode-algorithms", feature = "unicode-algorithms-cached"))]
{
const MOD_PATH: &str = "src/text/tables.rs";
println!("cargo:rerun-if-env-changed=UNICODE_REGENERATE_TABLES");
println!("cargo:rerun-if-changed=build.rs");
println!("cargo:rerun-if-changed={}", MOD_PATH);
println!("cargo:rerun-if-changed={MOD_PATH}");
/* Line break tables */
use std::{
fs::File,
@ -54,7 +53,7 @@ fn main() -> Result<(), std::io::Error> {
);
return Ok(());
}
if cfg!(feature = "unicode-algorithms-cached") {
if std::env::var("UNICODE_REGENERATE_TABLES").is_err() {
const CACHED_MODULE: &[u8] = include_bytes!(concat!("./src/text/tables.rs.gz"));
let mut gz = GzDecoder::new(CACHED_MODULE);

View File

@ -20,14 +20,12 @@
*/
use super::*;
#[cfg(feature = "text-processing")]
use crate::text::grapheme_clusters::TextProcessing;
pub fn encode_header(value: &str) -> String {
let mut ret = String::with_capacity(value.len());
let mut is_current_window_ascii = true;
let mut current_window_start = 0;
#[cfg(feature = "text-processing")]
{
let graphemes = value.graphemes_indices();
for (idx, g) in graphemes {
@ -81,63 +79,6 @@ pub fn encode_header(value: &str) -> String {
}
}
}
#[cfg(not(feature = "text-processing"))]
{
/* [ref:VERIFY] [ref:TODO]: test this. If it works as fine as the one above, there's no need to
* keep the above implementation. */
for (i, g) in value.char_indices() {
match (g.is_ascii(), is_current_window_ascii) {
(true, true) => {
ret.push(g);
}
(true, false) => {
/* If !g.is_whitespace()
*
* Whitespaces inside encoded tokens must be greedily taken,
* instead of splitting each non-ascii word into separate encoded tokens. */
if !g.is_whitespace() && value.is_char_boundary(i) {
ret.push_str(&format!(
"=?UTF-8?B?{}?=",
BASE64_MIME
.encode(value[current_window_start..i].as_bytes())
.trim()
));
if i != value.len() - 1 {
ret.push(' ');
}
is_current_window_ascii = true;
current_window_start = i;
ret.push(g);
}
}
(false, true) => {
current_window_start = i;
is_current_window_ascii = false;
}
/* RFC2047 recommends:
* 'While there is no limit to the length of a multiple-line header field, each
* line of a header field that contains one or more
* 'encoded-word's is limited to 76 characters.'
* This is a rough compliance.
*/
(false, false)
if value.is_char_boundary(i) && value[current_window_start..i].len() > 76 =>
{
ret.push_str(&format!(
"=?UTF-8?B?{}?=",
BASE64_MIME
.encode(value[current_window_start..i].as_bytes())
.trim()
));
if i != value.len() - 1 {
ret.push(' ');
}
current_window_start = i;
}
(false, false) => {}
}
}
}
/* If the last part of the header value is encoded, it won't be pushed inside
* the previous for block */
if !is_current_window_ascii {

View File

@ -132,7 +132,6 @@ pub mod dbg {
}
}
#[cfg(feature = "text-processing")]
pub mod text;
pub use utils::{

View File

@ -29,12 +29,12 @@
*/
use unicode_segmentation::UnicodeSegmentation;
use super::{
types::Reflow,
wcwidth::{wcwidth, CodePointsIter},
};
extern crate unicode_segmentation;
use self::unicode_segmentation::UnicodeSegmentation;
pub trait TextProcessing: UnicodeSegmentation + CodePointsIter {
fn split_graphemes(&self) -> Vec<&str> {

View File

@ -19,12 +19,11 @@
* along with meli. If not, see <http://www.gnu.org/licenses/>.
*/
extern crate unicode_segmentation;
use std::{cmp::Ordering, collections::VecDeque, iter::Peekable, str::FromStr};
use unicode_segmentation::UnicodeSegmentation;
use LineBreakClass::*;
use self::unicode_segmentation::UnicodeSegmentation;
use super::{
grapheme_clusters::TextProcessing,
tables::LINE_BREAK_RULES,

View File

@ -19,6 +19,8 @@
* along with meli. If not, see <http://www.gnu.org/licenses/>.
*/
use unicode_segmentation::UnicodeSegmentation;
pub mod grapheme_clusters;
pub mod line_break;
pub mod search;
@ -43,8 +45,6 @@ impl Truncate for &str {
return;
}
extern crate unicode_segmentation;
use unicode_segmentation::UnicodeSegmentation;
if let Some((last, _)) = UnicodeSegmentation::grapheme_indices(*self, true)
.take(new_len)
.last()
@ -58,8 +58,6 @@ impl Truncate for &str {
return self;
}
extern crate unicode_segmentation;
use unicode_segmentation::UnicodeSegmentation;
if let Some((last, _)) = UnicodeSegmentation::grapheme_indices(*self, true)
.take(new_len)
.last()
@ -75,8 +73,6 @@ impl Truncate for &str {
return "";
}
extern crate unicode_segmentation;
use unicode_segmentation::UnicodeSegmentation;
if let Some((first, _)) = UnicodeSegmentation::grapheme_indices(*self, true).nth(skip_len) {
&self[first..]
} else {
@ -90,8 +86,6 @@ impl Truncate for &str {
return;
}
extern crate unicode_segmentation;
use unicode_segmentation::UnicodeSegmentation;
if let Some((first, _)) = UnicodeSegmentation::grapheme_indices(*self, true).nth(skip_len) {
*self = &self[first..];
}
@ -104,8 +98,6 @@ impl Truncate for String {
return;
}
extern crate unicode_segmentation;
use unicode_segmentation::UnicodeSegmentation;
if let Some((last, _)) = UnicodeSegmentation::grapheme_indices(self.as_str(), true)
.take(new_len)
.last()
@ -119,8 +111,6 @@ impl Truncate for String {
return self;
}
extern crate unicode_segmentation;
use unicode_segmentation::UnicodeSegmentation;
if let Some((last, _)) = UnicodeSegmentation::grapheme_indices(self.as_str(), true)
.take(new_len)
.last()
@ -136,8 +126,6 @@ impl Truncate for String {
return "";
}
extern crate unicode_segmentation;
use unicode_segmentation::UnicodeSegmentation;
if let Some((first, _)) =
UnicodeSegmentation::grapheme_indices(self.as_str(), true).nth(skip_len)
{
@ -153,8 +141,6 @@ impl Truncate for String {
return;
}
extern crate unicode_segmentation;
use unicode_segmentation::UnicodeSegmentation;
if let Some((first, _)) =
UnicodeSegmentation::grapheme_indices(self.as_str(), true).nth(skip_len)
{

View File

@ -52,7 +52,6 @@ pub use iterators::*;
use smallvec::SmallVec;
use uuid::Uuid;
#[cfg(feature = "text-processing")]
use crate::text::grapheme_clusters::*;
type Envelopes = Arc<RwLock<HashMap<EnvelopeHash, Envelope>>>;
@ -1223,16 +1222,11 @@ impl Threads {
}
let ma = &envelopes[&a.unwrap()];
let mb = &envelopes[&b.unwrap()];
#[cfg(feature = "text-processing")]
{
ma.subject()
.split_graphemes()
.cmp(&mb.subject().split_graphemes())
}
#[cfg(not(feature = "text-processing"))]
{
ma.subject().cmp(&mb.subject())
}
}
(SortField::Subject, SortOrder::Asc) => {
let a = &self.thread_nodes[&self.thread_ref(*a).root()].message();
@ -1252,18 +1246,12 @@ impl Threads {
}
let ma = &envelopes[&a.unwrap()];
let mb = &envelopes[&b.unwrap()];
#[cfg(feature = "text-processing")]
{
mb.subject()
.as_ref()
.split_graphemes()
.cmp(&ma.subject().split_graphemes())
}
#[cfg(not(feature = "text-processing"))]
{
mb.subject().as_ref().cmp(&ma.subject())
}
}
});
}
@ -1303,16 +1291,11 @@ impl Threads {
}
let ma = &envelopes[&a.unwrap()];
let mb = &envelopes[&b.unwrap()];
#[cfg(feature = "text-processing")]
{
ma.subject()
.split_graphemes()
.cmp(&mb.subject().split_graphemes())
}
#[cfg(not(feature = "text-processing"))]
{
ma.subject().cmp(&mb.subject())
}
}
(SortField::Subject, SortOrder::Asc) => {
let a = &self.thread_nodes[a].message();
@ -1332,18 +1315,12 @@ impl Threads {
}
let ma = &envelopes[&a.unwrap()];
let mb = &envelopes[&b.unwrap()];
#[cfg(feature = "text-processing")]
{
mb.subject()
.as_ref()
.split_graphemes()
.cmp(&ma.subject().split_graphemes())
}
#[cfg(not(feature = "text-processing"))]
{
mb.subject().as_ref().cmp(&ma.subject())
}
}
});
}
@ -1379,16 +1356,11 @@ impl Threads {
}
let ma = &envelopes[&a.unwrap()];
let mb = &envelopes[&b.unwrap()];
#[cfg(feature = "text-processing")]
{
ma.subject()
.split_graphemes()
.cmp(&mb.subject().split_graphemes())
}
#[cfg(not(feature = "text-processing"))]
{
ma.subject().cmp(&mb.subject())
}
}
(SortField::Subject, SortOrder::Asc) => {
let a = &self.thread_nodes[a].message();
@ -1408,18 +1380,12 @@ impl Threads {
}
let ma = &envelopes[&a.unwrap()];
let mb = &envelopes[&b.unwrap()];
#[cfg(feature = "text-processing")]
{
mb.subject()
.as_ref()
.split_graphemes()
.cmp(&ma.subject().split_graphemes())
}
#[cfg(not(feature = "text-processing"))]
{
mb.subject().as_ref().cmp(&ma.subject())
}
}
});
}

View File

@ -40,7 +40,7 @@ required-features = ["melib/imap"]
[dependencies]
crossbeam = { version = "^0.8" }
meli = { path = "../meli", version = "0.8" }
melib = { path = "../melib", version = "0.8", features = ["debug-tracing", "unicode-algorithms"] }
melib = { path = "../melib", version = "0.8", features = ["debug-tracing"] }
nix = { version = "^0.24", default-features = false }
signal-hook = { version = "^0.3", default-features = false, features = ["iterator"] }
signal-hook-registry = { version = "1.2.0", default-features = false }