use std::collections::{BTreeMap, HashMap, HashSet};
use pyo3::prelude::*;
use pyo3::{exceptions, ObjectProtocol};
use fst::{IntoStreamer, Set};
use fst_levenshtein::Levenshtein;
use rayon::prelude::*;
use regex::{Regex};
use std::cmp::Reverse;
use pyo3::types::PyAny;
use itertools::Itertools;
use std::error::Error;
use std::fmt;
use strsim::normalized_damerau_levenshtein;

#[macro_use]
extern crate itertools;


/// Character-aware slicing helpers for string slices.
///
/// All windows and offsets are counted in Unicode scalar values (`char`s),
/// never in bytes, so multi-byte characters are never split.
trait CharTrigrams {
    /// Returns every window of four consecutive characters, in order.
    ///
    /// A string shorter than four characters yields itself as the only item.
    fn char_quadgrams(&self) -> Vec<&str>;
    /// Returns every window of three consecutive characters, in order.
    ///
    /// A string shorter than three characters yields itself as the only item.
    fn char_trigrams(&self) -> Vec<&str>;
    /// Returns the string with its first `i` characters removed, or `""`
    /// when the string has `i` characters or fewer.
    fn without_first(&self, i: usize) -> &str;
}

/// Splits `text` into all windows of `n` consecutive characters.
///
/// If `text` has fewer than `n` characters the whole string is the only window.
fn char_ngrams(text: &str, n: usize) -> Vec<&str> {
    let starts: Vec<usize> = text.char_indices().map(|(offset, _)| offset).collect();

    if starts.len() < n {
        return vec![text];
    }

    // A window ends where the character `n` positions later begins; the last
    // window ends at the byte length of the string (not at its char count,
    // which is a different number as soon as a multi-byte char is present).
    let ends = starts[n..]
        .iter()
        .copied()
        .chain(std::iter::once(text.len()));

    starts
        .iter()
        .zip(ends)
        .map(|(&start, end)| &text[start..end])
        .collect()
}

impl CharTrigrams for &str {
    fn char_quadgrams(&self) -> Vec<&str> {
        char_ngrams(*self, 4)
    }

    fn char_trigrams(&self) -> Vec<&str> {
        char_ngrams(*self, 3)
    }

    fn without_first(&self, i: usize) -> &str {
        match self.char_indices().nth(i) {
            Some((offset, _)) => &self[offset..],
            None => "",
        }
    }
}

/// Approximate key lookups over a string-keyed collection holding values of type `U`.
///
/// The descriptions below follow the `BTreeMap<String, U>` implementation in this file.
trait ImperfectLookup<U> {
    /// Returns the entries whose key occurs somewhere inside `key`, each with a
    /// `u8` score derived from the normalized Damerau-Levenshtein similarity
    /// between the stored key and `key` (scaled by 100); best score first.
    fn get_by_superstring(&self, key: &str) -> Vec<(&str, &U, u8)>;
    /// Returns the entries whose key begins with a trailing part of `key`;
    /// `min_overlap` is the minimum length that trailing part must have.
    fn get_by_overlap(&self, key: &str, min_overlap: usize) -> Vec<(&str, &U)>;
    /// Returns the entries whose key starts with `key` and whose byte length is
    /// exactly `key.len() + length_diff`.
    fn get_by_prefix(&self, key: &str, length_diff: usize) -> Vec<(&str, &U)>;
}

impl<U> ImperfectLookup<U> for BTreeMap<String, U> {
    /// Finds stored keys that are substrings of `key`.
    ///
    /// Candidates are located by range-scanning the map from each character
    /// quadgram of `key`, so a stored key is only found if it starts with one
    /// of those quadgrams. Each hit carries a score: the normalized
    /// Damerau-Levenshtein similarity to `key` scaled by 100 and truncated to
    /// `u8`. Results are de-duplicated by key and ordered best score first.
    fn get_by_superstring(&self, key: &str) -> Vec<(&str, &U, u8)> {
        key.char_quadgrams()
            .into_iter()
            .flat_map(|sub_key| {
                self.range(sub_key.to_owned()..)
                    // The map is ordered, so entries sharing the prefix are contiguous.
                    .take_while(move |(k, _v)| k.starts_with(sub_key))
                    .filter(|(k, _v)| key.contains(k.as_str()))
            })
            .unique_by(|(k, _)| k.as_str())
            .map(|(k, v)| (
                k.as_str(),
                v,
                (100.0 * normalized_damerau_levenshtein(k, key)) as u8
            ))
            .sorted_by_key(|(_, _, similarity)| Reverse(*similarity))
            .collect()
    }

    /// Finds stored keys that start with `key` and are exactly `length_diff`
    /// bytes longer than it.
    fn get_by_prefix(&self, key: &str, length_diff: usize) -> Vec<(&str, &U)> {
        let wanted_len = key.len() + length_diff;
        self
            .range(key.to_owned()..)
            .take_while(|(k, _v)| k.starts_with(key))
            .filter(|(k, _v)| k.len() == wanted_len)
            .map(|(k, v)| (k.as_str(), v))
            .collect()
    }

    /// Finds stored keys that begin with a suffix of `key`.
    ///
    /// For every character quadgram of `key`, the suffix of `key` starting at
    /// that quadgram is tried as a prefix of the stored keys. Suffixes shorter
    /// than `min_overlap` bytes are skipped. Results are de-duplicated by key.
    fn get_by_overlap(&self, key: &str, min_overlap: usize) -> Vec<(&str, &U)> {
        key.char_quadgrams()
            .into_iter()
            .enumerate()
            .filter_map(|(index, sub_key)| {
                // The overlap is the suffix itself, so measure the suffix rather
                // than subtracting a character index from a byte length.
                let suffix = key.without_first(index);
                if suffix.len() >= min_overlap {
                    Some((sub_key, suffix))
                } else {
                    None
                }
            })
            .flat_map(|(sub_key, suffix)| {
                self.range(sub_key.to_owned()..)
                    .take_while(move |(k, _v)| k.starts_with(sub_key))
                    .filter(move |(k, _v)| k.starts_with(suffix))
            })
            .unique_by(|(k, _)| k.as_str())
            .map(|(k, v)| (k.as_str(), v))
            .collect()
    }
}

// Python-exposed dictionary from string keys to Python objects that supports
// exact, prefix, overlap, superstring and fuzzy key lookups.
#[pyclass]
struct StringAnalysisDict {
    // Ordered key -> value storage; the ordering is what the range-based
    // prefix/overlap/superstring lookups rely on.
    map: BTreeMap<String, PyObject>,
    // Set of keys used by the fuzzy lookups; `None` until `finalize` builds it.
    fst: Option<Set>,
    // Quadgram -> keys containing that quadgram; filled by `insert`.
    // NOTE(review): nothing in this file reads it back - confirm whether it is still needed.
    quadgram_index: HashMap<String, HashSet<String>>,
}

impl StringAnalysisDict {
    /// Creates an empty dictionary: no entries, no fuzzy index, no quadgrams.
    fn new() -> Self {
        StringAnalysisDict {
            map: BTreeMap::new(),
            fst: None,
            quadgram_index: HashMap::new(),
        }
    }
}

/// Builds a Python `ValueError` carrying a copy of `message`.
fn err(message: &str) -> PyErr {
    let text = String::from(message);
    exceptions::ValueError::py_err(text)
}

/// Error produced by the fuzzy-lookup helpers.
struct FuzzyError {
    /// Description of what went wrong.
    msg: String,
}

impl fmt::Display for FuzzyError {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        write!(f, "Fuzzy err {}", self.msg) // user-facing output
    }
}

impl fmt::Debug for FuzzyError {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        // `file!()` / `line!()` expand to the location of this `write!`,
        // not to the place where the error value was created.
        write!(f, "{{ msg: {}, file: {}, line: {} }}", self.msg, file!(), line!()) // programmer-facing output
    }
}

// With `Display` and `Debug` in place this makes the type a proper error, so
// it can be boxed as `Box<dyn Error>` and propagated with `?`.
impl std::error::Error for FuzzyError {}

impl StringAnalysisDict {
    /// Returns every stored `(key, value)` whose key is within Levenshtein
    /// distance `distance` of `key`.
    ///
    /// Yields an empty list when `key` is `None` or when the fuzzy index has
    /// not been built yet (see `finalize`).
    ///
    /// # Errors
    ///
    /// Returns a `FuzzyError` naming the underlying cause when the Levenshtein
    /// automaton cannot be built or the matching keys cannot be read back.
    fn _get_fuzzy(&self, key: Option<&str>, distance: u32) -> Result<Vec<(String, &PyObject)>, FuzzyError> {
        let (key, set) = match (key, &self.fst) {
            (Some(key), Some(set)) => (key, set),
            _ => return Ok(vec![]),
        };

        let automaton = Levenshtein::new(key, distance)
            .map_err(|e| FuzzyError { msg: format!("Could not create fst: {}", e) })?;
        let keys = set
            .search(automaton)
            .into_stream()
            .into_strs()
            .map_err(|e| FuzzyError { msg: format!("Could not stream keys: {}", e) })?;

        Ok(keys
            .into_iter()
            .filter_map(|k| {
                // Keys only present in the index (never in the map) are skipped.
                let value = self.map.get(k.as_str())?;
                Some((k, value))
            })
            .collect())
    }

    /// Superstring lookup guarded by a minimum key length.
    ///
    /// Returns nothing for a `None` key or a key shorter than `min_length`
    /// bytes; otherwise delegates to `ImperfectLookup::get_by_superstring`.
    fn _get_by_superstring(&self, k: Option<&str>, min_length: usize) -> Vec<(&str, &PyObject, u8)> {
        match k {
            Some(key) if key.len() >= min_length => self.map.get_by_superstring(key),
            _ => Vec::new(),
        }
    }
}

// Methods exposed to Python. Every `*_vectorized` variant applies its scalar
// counterpart to a list of inputs inside `py.allow_threads`, one result list
// per input and in input order. `None` keys never match anything.
#[pymethods]
impl StringAnalysisDict {
    // Exact lookup.
    fn get(&self, k: Option<&str>) -> Option<&PyObject> {
        k.and_then(|k| self.map.get(k))
    }

    // Entries whose key starts with `k` and is exactly `length_diff` bytes
    // longer; keys shorter than `min_length` bytes are not looked up at all.
    #[args(min_length = 0)]
    fn get_by_prefix(&self, k: Option<&str>, length_diff: usize, min_length: usize) -> Vec<(&str, &PyObject)> {
        match k {
            Some(key) if key.len() >= min_length => self.map.get_by_prefix(key, length_diff),
            _ => Vec::new(),
        }
    }

    #[args(min_length = 0)]
    fn get_by_prefix_vectorized(&self, py: Python, kss: Vec<Option<&str>>, length_diff: usize, min_length: usize) -> Vec<Vec<(&str, &PyObject)>> {
        py.allow_threads(move || {
            kss.par_iter()
                .map(|k| self.get_by_prefix(*k, length_diff, min_length))
                .collect()
        })
    }

    // Concatenated `get_by_prefix` results for every key in `ks`.
    #[args(min_length = 0)]
    fn get_by_any_prefix(&self, ks: Vec<Option<&str>>, length_diff: usize, min_length: usize) -> Vec<(&str, &PyObject)> {
        ks.iter()
            .flat_map(|k| self.get_by_prefix(*k, length_diff, min_length))
            .collect()
    }

    #[args(min_length = 0)]
    fn get_by_any_prefix_vectorized(&self, py: Python, kss: Vec<Vec<Option<&str>>>, length_diff: usize, min_length: usize) -> Vec<Vec<(&str, &PyObject)>> {
        // Parallel like the other vectorized lookups.
        py.allow_threads(move || {
            kss.par_iter()
                .map(|ks| self.get_by_any_prefix(ks.to_vec(), length_diff, min_length))
                .collect()
        })
    }

    // Entries whose key begins with a suffix of `k` (see `ImperfectLookup::get_by_overlap`).
    #[args(min_overlap = 0)]
    fn get_by_overlap(&self, k: Option<&str>, min_overlap: usize) -> Vec<(&str, &PyObject)> {
        k.map(|k| self.map.get_by_overlap(k, min_overlap)).unwrap_or_default()
    }

    #[args(min_overlap = 0)]
    fn get_by_overlap_vectorized(&self, py: Python, k: Vec<Option<&str>>, min_overlap: usize) -> Vec<Vec<(&str, &PyObject)>> {
        py.allow_threads(move || {
            k.par_iter()
                .map(|key| self.get_by_overlap(*key, min_overlap))
                .collect()
        })
    }

    // Entries whose key is contained in `k`, best similarity first; the
    // similarity score itself is dropped from the result.
    #[args(min_length = 0)]
    fn get_by_superstring(&self, k: Option<&str>, min_length: usize) -> Vec<(&str, &PyObject)> {
        self
            ._get_by_superstring(k, min_length)
            .into_iter()
            .map(|(key, value, _)| (key, value))
            .collect()
    }

    #[args(min_length = 0)]
    fn get_by_superstring_vectorized(&self, py: Python, k: Vec<Option<&str>>, min_length: usize) -> Vec<Vec<(&str, &PyObject)>> {
        py.allow_threads(move || {
            k.par_iter()
                .map(|key| self.get_by_superstring(*key, min_length))
                .collect()
        })
    }

    // Superstring hits for all keys in `ks`, merged, ordered best similarity
    // first and de-duplicated by stored key (the best-scoring hit is kept).
    #[args(min_length = 0)]
    fn get_by_any_superstring(&self, ks: Vec<Option<&str>>, min_length: usize) -> Vec<(&str, &PyObject)> {
        ks
            .iter()
            .flat_map(|k| self._get_by_superstring(*k, min_length))
            .sorted_by_key(|(_, _, similarity)| Reverse(*similarity))
            .map(|(key, value, _)| (key, value))
            .unique_by(|(key, _)| *key)
            .collect()
    }

    #[args(min_length = 0)]
    fn get_by_any_superstring_vectorized(&self, py: Python, kss: Vec<Vec<Option<&str>>>, min_length: usize) -> Vec<Vec<(&str, &PyObject)>> {
        py.allow_threads(move || {
            kss.par_iter()
                .map(|ks| self.get_by_any_superstring(ks.to_vec(), min_length))
                .collect()
        })
    }

    // Stores `v` under `k` (replacing any previous value) and records the
    // key's quadgrams. A `None` key is ignored.
    fn insert(&mut self, k: Option<&str>, v: PyObject) {
        if let Some(k) = k {
            self.map.insert(k.to_string(), v);
            for quadgram in k.char_quadgrams() {
                self.quadgram_index
                    .entry(quadgram.to_string())
                    .or_default()
                    .insert(k.to_string());
            }
        }
    }

    fn insert_pairs(&mut self, pairs: Vec<(Option<&str>, PyObject)>) {
        for (k, v) in pairs {
            self.insert(k, v);
        }
    }

    // Inserts `keys[i] -> values[i]`. If the lists differ in length the
    // surplus items of the longer one are ignored.
    fn insert_keys_and_values(&mut self, keys: Vec<Option<&str>>, values: Vec<PyObject>) {
        for (k, v) in keys.iter().zip(values) {
            self.insert(*k, v);
        }
    }

    fn keys(&self) -> Vec<String> {
        self.map.keys().cloned().collect()
    }

    // Builds the key index used by the fuzzy lookups.
    //
    // Calling it again is cheap while the key set is unchanged, but it
    // rebuilds the index when keys were inserted since the last build.
    fn finalize(&mut self) -> PyResult<()> {
        // Keys can only be added, never removed, so an index with as many
        // entries as the map still covers exactly the current key set.
        let up_to_date = self
            .fst
            .as_ref()
            .map_or(false, |set| set.len() == self.map.len());
        if up_to_date {
            return Ok(());
        }

        let fst = Set::from_iter(self.map.keys().map(|k| k.as_str()))
            .map_err(|_e| err("could not create fst Set"))?;
        self.fst = Some(fst);

        Ok(())
    }

    // Entries whose key is within Levenshtein distance `distance` of `key`.
    // Returns an empty list until `finalize` has been called.
    fn get_fuzzy(&self, key: Option<&str>, distance: u32) -> PyResult<Vec<(String, &PyObject)>> {
        self._get_fuzzy(key, distance).map_err(|e| err(&format!("{:?}", e)))
    }

    fn get_fuzzy_vectorized(&self, py: Python, key: Vec<Option<&str>>, distance: u32) -> PyResult<Vec<Vec<(String, &PyObject)>>> {
        let results: Result<Vec<Vec<(String, &PyObject)>>, FuzzyError> = py.allow_threads(move || {
            key.par_iter()
                .map(|k| self._get_fuzzy(*k, distance))
                .collect()
        });
        // Build the Python exception only after `allow_threads` has returned.
        results.map_err(|e| err(&format!("{:?}", e)))
    }
}

/// Ranks, for each `(truth, options)` pair, the options by how many tokens
/// they share with the truth string.
///
/// Strings are split on runs of whitespace and `- / ( ) , +`; tokens of three
/// bytes or fewer are ignored and comparison is done on upper-cased tokens.
/// For each pair the result lists the indices of the options sharing at least
/// one token, most shared tokens first (ties keep their original order).
/// A `None` truth yields an empty list; `None` options are skipped.
fn _order_by_token_alignment_vectorized(truths: &Vec<Option<&str>>, vecs: &Vec<Vec<Option<&str>>>) -> Vec<Vec<usize>> {
    let splitter = Regex::new(r"[\s\-/(),+]+").unwrap();

    truths
        .par_iter()
        .zip(vecs.par_iter())
        .map(|(truth, options)| {
            let truth = match truth {
                Some(text) => text,
                None => return vec![],
            };

            // The length filter runs on the raw token, before upper-casing.
            let wanted: Vec<String> = splitter
                .split(truth)
                .filter(|token| token.len() > 3)
                .map(|token| token.to_uppercase())
                .collect();

            let mut scored: Vec<(usize, usize)> = Vec::new();
            for (index, option) in options.iter().enumerate() {
                if let Some(candidate) = option {
                    let upper = candidate.to_uppercase();
                    let hits = splitter
                        .split(&upper)
                        .filter(|token| token.len() > 3)
                        .filter(|token| wanted.iter().any(|w| w.as_str() == *token))
                        .count();
                    if hits > 0 {
                        scored.push((index, hits));
                    }
                }
            }

            // Stable sort: equal hit counts stay in option order.
            scored.sort_by_key(|&(_, hits)| Reverse(hits));

            let order: Vec<usize> = scored.into_iter().map(|(index, _)| index).collect();
            order
        })
        .collect()
}

/// Sorts every inner list by its integer weight, largest first, and returns
/// only the objects. Items with equal weights keep their relative order.
fn _sort_list_of_lists(list: Vec<Vec<(&PyAny, i32)>>) -> Vec<Vec<&PyAny>> {
    list.par_iter()
        .map(|pairs| {
            let mut ranked: Vec<(&PyAny, i32)> = pairs.clone();
            ranked.sort_by_key(|&(_, weight)| Reverse(weight));
            ranked.into_iter().map(|(item, _)| item).collect()
        })
        .collect()
}

/// Searches for a three-part pattern that turns each `actual` string of
/// `examples` into its `expected` string.
///
/// Candidate regexes have the shape `^(p0)(sep1)(p1)(sep2)(p2)$`. Each part is
/// a literal character present in every `actual`, a single digit/letter class,
/// a repeated class, or a catch-all, with specificity 100, 10, 1 and 0
/// respectively. Only candidates whose combined specificity is below 220 are
/// tried. A candidate is accepted when, for at least `percent_required`
/// percent of the examples, concatenating a fixed selection of its non-empty
/// capture groups reproduces `expected`. The accepted candidate with the
/// highest specificity wins.
///
/// Returns `Ok(None)` when `examples` is empty or when no candidate with a
/// positive specificity is accepted. Otherwise returns the winning pattern
/// pieces (parts interleaved with their non-empty separators) and the indices
/// of the capture groups to concatenate.
///
/// # Errors
///
/// The current implementation never returns `Err`.
///
/// # Panics
///
/// Panics if a generated pattern is not a valid regex.
fn _regex_parts(examples: Vec<(&str, &str)>, percent_required: i32) -> Result<Option<(Vec<String>, Vec<i32>)>, Box<dyn Error>> {
    // Literal characters that may be used as a pattern part.
    const LITERALS: &[&str] = &[
        "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M",
        "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z",
        "1", "2", "3", "4", "5", "6", "7", "8", "9", "0",
    ];
    // Selections of non-empty capture groups (index 0 is the whole match)
    // that are tried, in this order, as the transformation.
    const INDEX_SETS: [&[usize]; 5] = [
        &[1, 2, 3],
        &[1, 2, 4],
        &[1, 2, 5],
        &[1, 2, 3, 4],
        &[1, 2, 3, 5],
    ];

    // Without examples every candidate would trivially qualify.
    if examples.is_empty() {
        return Ok(None);
    }

    let separators = vec!["\\.", "-", "/", "_", ""];
    // NOTE(review): this class lists `.` twice and omits `/`, although `/` is
    // one of the separators above - confirm whether `/` was intended.
    let not_separators_regex = "[^\\.\\-._]+?";

    let specificity_1: Vec<(&str, u16)> = LITERALS
        .iter()
        .filter(|literal| examples.iter().all(|(actual, _)| actual.contains(**literal)))
        .map(|literal| (*literal, 100))
        .collect();
    let specificity_2: Vec<(&str, u16)> = vec![("\\d", 10), ("[A-Z]", 10)];
    let specificity_3: Vec<(&str, u16)> = vec![("\\d+", 1), ("[A-Z]+", 1)];
    let specificity_4: Vec<(&str, u16)> = vec![(not_separators_regex, 0), ("", 0)];

    let all_seps: Vec<_> = iproduct!(&separators, &separators).map(|(a, b)| (*a, *b)).collect();

    let all: Vec<&(&str, u16)> = specificity_1.iter()
        .chain(specificity_2.iter())
        .chain(specificity_3.iter())
        .chain(specificity_4.iter())
        .collect();

    // Number of examples a candidate has to reproduce; the same for all candidates.
    let required = (examples.len() as f32 * (0.01 * percent_required as f32)) as usize;

    let best = iproduct!(&all, &all, &all, all_seps)
        .filter(|(p0, p1, p2, _)| p0.1 + p1.1 + p2.1 < 220)
        .par_bridge()
        .map(|(p0, p1, p2, (sep1, sep2))| {
            let re = Regex::new(&format!("^({})({})({})({})({})$", p0.0, sep1, p1.0, sep2, p2.0)).unwrap();
            let parts = (p0.0, p1.0, p2.0);
            let score = p0.1 + p1.1 + p2.1;
            (re, parts, score, sep1, sep2)
        })
        .filter_map(move |(re, parts, score, sep1, sep2)| {
            INDEX_SETS
                .iter()
                .find(|indices| {
                    let reproduced = examples
                        .iter()
                        .filter(|(actual, expected)| match re.captures(actual) {
                            Some(groups) => {
                                let non_blank_groups: Vec<&str> = groups
                                    .iter()
                                    .filter_map(|g| g)
                                    .map(|m| m.as_str())
                                    .filter(|g| !g.is_empty())
                                    .collect();
                                let transformed = indices
                                    .iter()
                                    .filter_map(|i| non_blank_groups.get(*i))
                                    .join("");
                                transformed.as_str() == *expected
                            }
                            None => false,
                        })
                        .count();
                    reproduced >= required
                })
                .map(|indices| (parts, score, sep1, sep2, indices.to_vec()))
        })
        .reduce(
            || (Default::default(), Default::default(), Default::default(), Default::default(), vec![]),
            |a, b| {
                // Highest score wins. Ties are broken on the candidate itself
                // so the result does not depend on how the work was scheduled.
                let keep_a = (a.1, a.0, a.2, a.3, &a.4) >= (b.1, b.0, b.2, b.3, &b.4);
                if keep_a { a } else { b }
            },
        );

    let (parts, score, sep1, sep2, indices) = best;
    if score == 0 {
        return Ok(None);
    }

    let mut results: Vec<String> = vec![parts.0.to_string()];
    if !sep1.is_empty() {
        results.push(sep1.to_string());
    }
    results.push(parts.1.to_string());
    if !sep2.is_empty() {
        results.push(sep2.to_string());
    }
    results.push(parts.2.to_string());
    Ok(Some((results, indices.iter().map(|i| *i as i32).collect())))
}

// Python module definition: registers the factory for `StringAnalysisDict`
// and the free helper functions under the names given in each `#[pyfn]`.
#[pymodule]
fn stringanalysis(_py: Python, m: &PyModule) -> PyResult<()> {
    // Creates a new, empty `StringAnalysisDict`.
    #[pyfn(m, "stringanalysisdict")]
    fn stringanalysisdict(_py: Python) -> PyResult<StringAnalysisDict> {
        Ok(StringAnalysisDict::new())
    }

    // Wraps `_order_by_token_alignment_vectorized`, run inside `py.allow_threads`.
    #[pyfn(m, "order_by_token_alignment_vectorized")]
    fn order_by_token_alignment_vectorized(py: Python, truths: Vec<Option<&str>>, vecs: Vec<Vec<Option<&str>>>) -> Vec<Vec<usize>> {
        py.allow_threads(move || {
            _order_by_token_alignment_vectorized(&truths, &vecs)
        })
    }

    // Wraps `_sort_list_of_lists`, run inside `py.allow_threads`.
    // The Python-visible name is "sort_list_of_lists" (plural), as given in the attribute.
    #[pyfn(m, "sort_list_of_lists")]
    fn sort_list_of_list<'a>(py: Python, list: Vec<Vec<(&'a PyAny, i32)>>) -> Vec<Vec<&'a PyAny>> {
        py.allow_threads(move || {
            _sort_list_of_lists(list)
        })
    }

    // Wraps `_regex_parts`; an error from it is turned into a Python `ValueError`
    // whose message is the error's `Debug` rendering.
    #[pyfn(m, "regex_parts")]
    fn regex_parts<'a>(py: Python, examples: Vec<(&str, &str)>, percent_required: i32) -> PyResult<Option<(Vec<String>, Vec<i32>)>> {
        py.allow_threads(move || {
            _regex_parts(examples, percent_required).map_err(|e| exceptions::ValueError::py_err(format!("{:?}", e)))
        })
    }

    Ok(())
}

#[cfg(test)]
mod tests {
    use crate::CharTrigrams;

    // A five-character ASCII string has exactly two overlapping quadgrams.
    #[test]
    fn gets_quadgrams() {
        let grams = "abcde".char_quadgrams();
        let expected = vec!["abcd", "bcde"];
        assert_eq!(grams, expected);
    }
}