diff options
Diffstat (limited to 'clustering_678.py')
-rwxr-xr-x | clustering_678.py | 100 |
1 files changed, 100 insertions, 0 deletions
# coding=utf-8
"""Cluster the value-less keys found in the PERSON records with k-means.

Pipeline (runs top to bottom when the script is executed):

1. Read the PERSON CSV; its first column holds the text of a dict literal and
   its second column an id.
2. Keep, for every record, only the dict keys whose value is the empty string.
3. Flatten to one (string, id) row per such key and save that table.
4. Derive simple text features for every string and save the feature table.
5. Normalize the features and print the KMeans clusters for 4 and 6 clusters.
"""
import unicodedata

import pandas as pd
from sklearn.cluster import KMeans

PERSON_001_PATH = r"C:\Users\Emanuel\Desktop\DataHack16\PERSON - 001.csv"
FEATURES_TABLE_PATH = r"C:\Users\Emanuel\Desktop\DataHack16\features_678.csv"
DATA_SEPARATED_PATH = r"C:\Users\Emanuel\Desktop\DataHack16\data_separated.csv"


def return_keys_without_value(dic):
    """Return the keys of *dic* whose value is the empty string.

    Returns None (rather than an empty list) when there is no such key, so
    that a later ``dropna()`` discards the record.
    """
    # todo: take care of keys without value that do contain : somewhere
    keys = [key for key, value in dic.items() if value == ""]
    return keys or None


def _to_text(s):
    """Return *s* as text, decoding it as UTF-8 when it is a byte string."""
    if isinstance(s, bytes):
        return s.decode("utf-8")
    return s


def _letters_only(s):
    """Return the alphabetic characters of *s*, in order, as one string."""
    return "".join(ch for ch in _to_text(s) if ch.isalpha())


def _all_letters_in_script(s, script):
    """Return True if the Unicode name of every letter in *s* contains *script*.

    Non-alphabetic characters (digits, punctuation, whitespace) are ignored.
    A string without any letter therefore yields True (vacuous truth).
    """
    # The default "" makes a character that has no Unicode name count as
    # "not in the script" instead of raising ValueError.
    return all(script in unicodedata.name(ch, "") for ch in _letters_only(s))


def is_all_hebrew(s):
    """Return True if every letter of *s* is a Hebrew letter.

    Non-letters are ignored; a string with no letters yields True.
    """
    return _all_letters_in_script(s, 'HEBREW')


def is_all_english(s):
    """Return True if every letter of *s* is a Latin-script letter.

    Non-letters are ignored; a string with no letters yields True.
    """
    return _all_letters_in_script(s, 'LATIN')


def count_words(s):
    """Return the number of whitespace-separated words in *s*."""
    return len(s.split())


# ---------------------------------------------------------------------------
# Load the records and keep only the keys that have an empty value.
# ---------------------------------------------------------------------------
data = pd.read_csv(PERSON_001_PATH, names=["string", "id"])
data = data.dropna()
data = data[data["string"] != "{}"]
# NOTE(security): eval() executes whatever expression the CSV cell contains.
# It is kept because the exact record format is not visible from this script;
# if every cell is a plain dict literal, ast.literal_eval is the safe drop-in.
data["string"] = [eval(k) for k in data["string"]]
data["string"] = [return_keys_without_value(dic) for dic in data["string"]]
data = data.dropna()  # records without any value-less key were mapped to None

# One output row per (value-less key, record id) pair.
string_list = []
id_list = []
for keys, record_id in zip(data["string"], data["id"]):
    string_list.extend(keys)
    id_list.extend([record_id] * len(keys))
new_data = pd.DataFrame({"string": string_list, "id": id_list})
new_data.to_csv(DATA_SEPARATED_PATH, encoding="utf-8")

# ---------------------------------------------------------------------------
# Feature extraction.
# ---------------------------------------------------------------------------
# todo: add a feature "contains_predefined_year_prefixes", like b. or d.
# todo: add a feature that checks whether the string contains a number that is
#       not a year (i.e not in the range ...)
# todo: detect hebrew years using quotes
new_data["is_all_hebrew"] = new_data["string"].apply(is_all_hebrew)
new_data["is_all_english"] = new_data["string"].apply(is_all_english)
new_data["number_of_words"] = new_data["string"].apply(count_words)
new_data["contains_quote"] = new_data["string"].apply(lambda s: '"' in s)
new_data["contains_colon"] = new_data["string"].apply(lambda s: ':' in s)
new_data.to_csv(FEATURES_TABLE_PATH, encoding="utf-8")

# ---------------------------------------------------------------------------
# Normalize the feature matrix (everything except the id and the raw string).
# ---------------------------------------------------------------------------
X = new_data.copy()
assert isinstance(X, pd.DataFrame)
del X["id"]
del X["string"]
print(X.columns)
# Cast to float so the boolean feature columns take part in the arithmetic
# as 0.0 / 1.0.
X = X.astype(float)
value_range = X.max() - X.min()
# A constant column has range 0; dividing by it would produce NaN, which
# KMeans does not accept. Dividing by 1 instead leaves such a column all zero.
value_range[value_range == 0] = 1.0
X = (X - X.mean()) / value_range  # normalizing the features

# ---------------------------------------------------------------------------
# Clustering: print labels, centers and the members of every cluster.
# ---------------------------------------------------------------------------
range_n_clusters = [4, 6]
for n_clusters in range_n_clusters:
    clusterer = KMeans(n_clusters=n_clusters, random_state=10)
    cluster_labels = clusterer.fit_predict(X)
    centers = clusterer.cluster_centers_
    print("\n %s clusters:" % n_clusters)
    print("cluster labels: %s" % cluster_labels)
    print("cluster centers: %s " % centers)

    for k in range(n_clusters):
        print("\ncluster %d consists of the following strings:" % k)
        print(new_data["string"][cluster_labels == k])