Make unicode-segmentation a hard dependency
Run cargo lints / Lint on ${{ matrix.build }} (linux-amd64, ubuntu-latest, stable, x86_64-unknown-linux-gnu) (pull_request) Successful in 7m48s Details
Run Tests / Test on ${{ matrix.build }} (linux-amd64, ubuntu-latest, stable, x86_64-unknown-linux-gnu) (pull_request) Successful in 15m42s Details
Cargo manifest lints / Lint Cargo manifests on ${{ matrix.build }} (linux-amd64, ubuntu-latest, stable, x86_64-unknown-linux-gnu) (pull_request) Successful in 11m5s Details

meli/melib are UTF-8 software, so we should have proper Unicode support.

A compile-time env var, `UNICODE_REGENERATE_TABLES`, is added to force
network access and rebuild the cached Unicode tables.

Signed-off-by: Manos Pitsidianakis <manos@pitsidianak.is>
pull/377/head
Manos Pitsidianakis 2024-04-11 21:15:47 +03:00
parent 07072e2e3f
commit ae96038fbf
Signed by: Manos Pitsidianakis
GPG Key ID: 7729C7707F7E09D0
13 changed files with 16 additions and 147 deletions

View File

@ -3,7 +3,7 @@
For a quick start, build and install locally:
```sh
PREFIX=~/.local make install
PREFIX=~/.local make install
```
Available subcommands for `make` are listed with `make help`.
@ -34,6 +34,9 @@ Some functionality is held behind "feature gates", or compile-time flags. The fo
Since its actual use in the code is very limited, it is not recommended to use this (off by default).
- `static` and `*-static` bundle C libraries in dependencies so that you don't need them installed in your system (on by default).
Though not a feature, the presence of the environment variable `UNICODE_REGENERATE_TABLES` at compile time of the `melib` crate will force the regeneration of the Unicode tables.
Otherwise the tables are included with the source code, and there's no real reason to regenerate them unless you intend to modify the code or update to a new Unicode version.
## Build Debian package (*deb*)
Building with Debian's packaged cargo might require the installation of these two packages: `librust-openssl-sys-dev librust-libdbus-sys-dev`

View File

@ -14,10 +14,7 @@ path = "fuzz_targets/envelope_parse.rs"
[dependencies]
libfuzzer-sys = "0.3"
[dependencies.melib]
path = "../melib"
features = ["unicode-algorithms"]
melib = { path = "../melib" }
# Prevent this from interfering with workspaces
[workspace]

View File

@ -31,7 +31,7 @@ indexmap = { version = "^1.6", features = ["serde-1"] }
libc = { version = "0.2.125", default-features = false, features = ["extra_traits"] }
libz-sys = { version = "1.1", features = ["static"], optional = true }
linkify = { version = "^0.8", default-features = false }
melib = { path = "../melib", version = "0.8.5-rc.3", features = ["unicode-algorithms"] }
melib = { path = "../melib", version = "0.8.5-rc.3", features = [] }
nix = { version = "0.27", default-features = false, features = ["signal", "poll", "term", "ioctl", "process"] }
notify = { version = "4.0.1", default-features = false } # >:c
num_cpus = "1.12.0"

View File

@ -50,7 +50,7 @@ serde_path_to_error = { version = "0.1" }
smallvec = { version = "^1.5.0", features = ["serde"] }
smol = "1.0.0"
socket2 = { version = "0.5", features = [] }
unicode-segmentation = { version = "1.2.1", default-features = false, optional = true }
unicode-segmentation = { version = "1.2.1", default-features = false }
url = { version = "2.4", optional = true }
uuid = { version = "^1", features = ["serde", "v4", "v5"] }
xdg = "2.1.0"
@ -77,9 +77,6 @@ sqlite3 = ["rusqlite"]
sqlite3-static = ["sqlite3", "rusqlite/bundled-full"]
tls = ["native-tls"]
tls-static = ["tls", "native-tls/vendored"]
text-processing = []
unicode-algorithms = ["text-processing", "unicode-segmentation"]
unicode-algorithms-cached = ["text-processing", "unicode-segmentation"]
vcard = []
[build-dependencies]

View File

@ -22,24 +22,6 @@ Library for handling mail.
|------------------------------|-------------------------------------|--------------------------|
| `sqlite` | `rusqlite` | Used in IMAP cache. |
|------------------------------|-------------------------------------|--------------------------|
| `unicode-algorithms` | `unicode-segmentation` | Linebreaking algo etc |
| | | For a fresh clean build, |
| | | Network access is |
| | | required to fetch data |
| | | from Unicode's website. |
|------------------------------|-------------------------------------|--------------------------|
| `unicode-algorithms-cached` | `unicode-segmentation` | Linebreaking algo etc |
| | | but it uses a cached |
| | | version of Unicode data |
| | | which might be stale. |
| | | |
| | | Use this feature instead |
| | | of the previous one for |
| | | building without network |
| | | access. |
|------------------------------|-------------------------------------|--------------------------|
| `unicode-algorithms` | `unicode-segmentation` | |
|------------------------------|-------------------------------------|--------------------------|
| `vcard` | | vcard parsing |
|------------------------------|-------------------------------------|--------------------------|
| `gpgme` | | GPG use with libgpgme |

View File

@ -21,15 +21,14 @@
#![allow(clippy::needless_range_loop)]
#[cfg(any(feature = "unicode-algorithms", feature = "unicode-algorithms-cached"))]
include!("src/text/types.rs");
fn main() -> Result<(), std::io::Error> {
#[cfg(any(feature = "unicode-algorithms", feature = "unicode-algorithms-cached"))]
{
const MOD_PATH: &str = "src/text/tables.rs";
println!("cargo:rerun-if-env-changed=UNICODE_REGENERATE_TABLES");
println!("cargo:rerun-if-changed=build.rs");
println!("cargo:rerun-if-changed={}", MOD_PATH);
println!("cargo:rerun-if-changed={MOD_PATH}");
/* Line break tables */
use std::{
fs::File,
@ -54,7 +53,7 @@ fn main() -> Result<(), std::io::Error> {
);
return Ok(());
}
if cfg!(feature = "unicode-algorithms-cached") {
if std::env::var("UNICODE_REGENERATE_TABLES").is_err() {
const CACHED_MODULE: &[u8] = include_bytes!(concat!("./src/text/tables.rs.gz"));
let mut gz = GzDecoder::new(CACHED_MODULE);

View File

@ -20,14 +20,12 @@
*/
use super::*;
#[cfg(feature = "text-processing")]
use crate::text::grapheme_clusters::TextProcessing;
pub fn encode_header(value: &str) -> String {
let mut ret = String::with_capacity(value.len());
let mut is_current_window_ascii = true;
let mut current_window_start = 0;
#[cfg(feature = "text-processing")]
{
let graphemes = value.graphemes_indices();
for (idx, g) in graphemes {
@ -81,63 +79,6 @@ pub fn encode_header(value: &str) -> String {
}
}
}
#[cfg(not(feature = "text-processing"))]
{
/* [ref:VERIFY] [ref:TODO]: test this. If it works as fine as the one above, there's no need to
* keep the above implementation. */
for (i, g) in value.char_indices() {
match (g.is_ascii(), is_current_window_ascii) {
(true, true) => {
ret.push(g);
}
(true, false) => {
/* If !g.is_whitespace()
*
* Whitespaces inside encoded tokens must be greedily taken,
* instead of splitting each non-ascii word into separate encoded tokens. */
if !g.is_whitespace() && value.is_char_boundary(i) {
ret.push_str(&format!(
"=?UTF-8?B?{}?=",
BASE64_MIME
.encode(value[current_window_start..i].as_bytes())
.trim()
));
if i != value.len() - 1 {
ret.push(' ');
}
is_current_window_ascii = true;
current_window_start = i;
ret.push(g);
}
}
(false, true) => {
current_window_start = i;
is_current_window_ascii = false;
}
/* RFC2047 recommends:
* 'While there is no limit to the length of a multiple-line header field, each
* line of a header field that contains one or more
* 'encoded-word's is limited to 76 characters.'
* This is a rough compliance.
*/
(false, false)
if value.is_char_boundary(i) && value[current_window_start..i].len() > 76 =>
{
ret.push_str(&format!(
"=?UTF-8?B?{}?=",
BASE64_MIME
.encode(value[current_window_start..i].as_bytes())
.trim()
));
if i != value.len() - 1 {
ret.push(' ');
}
current_window_start = i;
}
(false, false) => {}
}
}
}
/* If the last part of the header value is encoded, it won't be pushed inside
* the previous for block */
if !is_current_window_ascii {

View File

@ -132,7 +132,6 @@ pub mod dbg {
}
}
#[cfg(feature = "text-processing")]
pub mod text;
pub use utils::{

View File

@ -29,12 +29,12 @@
*/
use unicode_segmentation::UnicodeSegmentation;
use super::{
types::Reflow,
wcwidth::{wcwidth, CodePointsIter},
};
extern crate unicode_segmentation;
use self::unicode_segmentation::UnicodeSegmentation;
pub trait TextProcessing: UnicodeSegmentation + CodePointsIter {
fn split_graphemes(&self) -> Vec<&str> {

View File

@ -19,12 +19,11 @@
* along with meli. If not, see <http://www.gnu.org/licenses/>.
*/
extern crate unicode_segmentation;
use std::{cmp::Ordering, collections::VecDeque, iter::Peekable, str::FromStr};
use unicode_segmentation::UnicodeSegmentation;
use LineBreakClass::*;
use self::unicode_segmentation::UnicodeSegmentation;
use super::{
grapheme_clusters::TextProcessing,
tables::LINE_BREAK_RULES,

View File

@ -19,6 +19,8 @@
* along with meli. If not, see <http://www.gnu.org/licenses/>.
*/
use unicode_segmentation::UnicodeSegmentation;
pub mod grapheme_clusters;
pub mod line_break;
pub mod search;
@ -43,8 +45,6 @@ impl Truncate for &str {
return;
}
extern crate unicode_segmentation;
use unicode_segmentation::UnicodeSegmentation;
if let Some((last, _)) = UnicodeSegmentation::grapheme_indices(*self, true)
.take(new_len)
.last()
@ -58,8 +58,6 @@ impl Truncate for &str {
return self;
}
extern crate unicode_segmentation;
use unicode_segmentation::UnicodeSegmentation;
if let Some((last, _)) = UnicodeSegmentation::grapheme_indices(*self, true)
.take(new_len)
.last()
@ -75,8 +73,6 @@ impl Truncate for &str {
return "";
}
extern crate unicode_segmentation;
use unicode_segmentation::UnicodeSegmentation;
if let Some((first, _)) = UnicodeSegmentation::grapheme_indices(*self, true).nth(skip_len) {
&self[first..]
} else {
@ -90,8 +86,6 @@ impl Truncate for &str {
return;
}
extern crate unicode_segmentation;
use unicode_segmentation::UnicodeSegmentation;
if let Some((first, _)) = UnicodeSegmentation::grapheme_indices(*self, true).nth(skip_len) {
*self = &self[first..];
}
@ -104,8 +98,6 @@ impl Truncate for String {
return;
}
extern crate unicode_segmentation;
use unicode_segmentation::UnicodeSegmentation;
if let Some((last, _)) = UnicodeSegmentation::grapheme_indices(self.as_str(), true)
.take(new_len)
.last()
@ -119,8 +111,6 @@ impl Truncate for String {
return self;
}
extern crate unicode_segmentation;
use unicode_segmentation::UnicodeSegmentation;
if let Some((last, _)) = UnicodeSegmentation::grapheme_indices(self.as_str(), true)
.take(new_len)
.last()
@ -136,8 +126,6 @@ impl Truncate for String {
return "";
}
extern crate unicode_segmentation;
use unicode_segmentation::UnicodeSegmentation;
if let Some((first, _)) =
UnicodeSegmentation::grapheme_indices(self.as_str(), true).nth(skip_len)
{
@ -153,8 +141,6 @@ impl Truncate for String {
return;
}
extern crate unicode_segmentation;
use unicode_segmentation::UnicodeSegmentation;
if let Some((first, _)) =
UnicodeSegmentation::grapheme_indices(self.as_str(), true).nth(skip_len)
{

View File

@ -52,7 +52,6 @@ pub use iterators::*;
use smallvec::SmallVec;
use uuid::Uuid;
#[cfg(feature = "text-processing")]
use crate::text::grapheme_clusters::*;
type Envelopes = Arc<RwLock<HashMap<EnvelopeHash, Envelope>>>;
@ -1223,16 +1222,11 @@ impl Threads {
}
let ma = &envelopes[&a.unwrap()];
let mb = &envelopes[&b.unwrap()];
#[cfg(feature = "text-processing")]
{
ma.subject()
.split_graphemes()
.cmp(&mb.subject().split_graphemes())
}
#[cfg(not(feature = "text-processing"))]
{
ma.subject().cmp(&mb.subject())
}
}
(SortField::Subject, SortOrder::Asc) => {
let a = &self.thread_nodes[&self.thread_ref(*a).root()].message();
@ -1252,18 +1246,12 @@ impl Threads {
}
let ma = &envelopes[&a.unwrap()];
let mb = &envelopes[&b.unwrap()];
#[cfg(feature = "text-processing")]
{
mb.subject()
.as_ref()
.split_graphemes()
.cmp(&ma.subject().split_graphemes())
}
#[cfg(not(feature = "text-processing"))]
{
mb.subject().as_ref().cmp(&ma.subject())
}
}
});
}
@ -1303,16 +1291,11 @@ impl Threads {
}
let ma = &envelopes[&a.unwrap()];
let mb = &envelopes[&b.unwrap()];
#[cfg(feature = "text-processing")]
{
ma.subject()
.split_graphemes()
.cmp(&mb.subject().split_graphemes())
}
#[cfg(not(feature = "text-processing"))]
{
ma.subject().cmp(&mb.subject())
}
}
(SortField::Subject, SortOrder::Asc) => {
let a = &self.thread_nodes[a].message();
@ -1332,18 +1315,12 @@ impl Threads {
}
let ma = &envelopes[&a.unwrap()];
let mb = &envelopes[&b.unwrap()];
#[cfg(feature = "text-processing")]
{
mb.subject()
.as_ref()
.split_graphemes()
.cmp(&ma.subject().split_graphemes())
}
#[cfg(not(feature = "text-processing"))]
{
mb.subject().as_ref().cmp(&ma.subject())
}
}
});
}
@ -1379,16 +1356,11 @@ impl Threads {
}
let ma = &envelopes[&a.unwrap()];
let mb = &envelopes[&b.unwrap()];
#[cfg(feature = "text-processing")]
{
ma.subject()
.split_graphemes()
.cmp(&mb.subject().split_graphemes())
}
#[cfg(not(feature = "text-processing"))]
{
ma.subject().cmp(&mb.subject())
}
}
(SortField::Subject, SortOrder::Asc) => {
let a = &self.thread_nodes[a].message();
@ -1408,18 +1380,12 @@ impl Threads {
}
let ma = &envelopes[&a.unwrap()];
let mb = &envelopes[&b.unwrap()];
#[cfg(feature = "text-processing")]
{
mb.subject()
.as_ref()
.split_graphemes()
.cmp(&ma.subject().split_graphemes())
}
#[cfg(not(feature = "text-processing"))]
{
mb.subject().as_ref().cmp(&ma.subject())
}
}
});
}

View File

@ -40,7 +40,7 @@ required-features = ["melib/imap"]
[dependencies]
crossbeam = { version = "^0.8" }
meli = { path = "../meli", version = "0.8" }
melib = { path = "../melib", version = "0.8", features = ["debug-tracing", "unicode-algorithms"] }
melib = { path = "../melib", version = "0.8", features = ["debug-tracing"] }
nix = { version = "^0.24", default-features = false }
signal-hook = { version = "^0.3", default-features = false, features = ["iterator"] }
signal-hook-registry = { version = "1.2.0", default-features = false }