Diffstat (limited to 'clustering_678.py')
-rwxr-xr-x  clustering_678.py  100
1 file changed, 100 insertions(+), 0 deletions(-)
diff --git a/clustering_678.py b/clustering_678.py
new file mode 100755
index 0000000..99391ac
--- /dev/null
+++ b/clustering_678.py
@@ -0,0 +1,100 @@
+# coding=utf-8
+import ast
+import pandas as pd
+import unicodedata
+from sklearn.cluster import KMeans
+
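+# Pipeline: read (dict-string, id) pairs, keep the keys whose value is empty,
+# derive simple per-string features, and cluster the strings with k-means.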
+PERSON_001_PATH = r"C:\Users\Emanuel\Desktop\DataHack16\PERSON - 001.csv"
+FEATURES_TABLE_PATH = r"C:\Users\Emanuel\Desktop\DataHack16\features_678.csv"
+
+
+# Collect every key whose value is the empty string. Returning None instead
+# of an empty list lets the later dropna() discard rows with no such keys.
+def return_keys_without_value(dic):
+    # todo: take care of keys without value that do contain : somewhere
+    keys = [key for key, value in dic.items() if value == ""]
+    return keys if keys else None
+
+
+# Each CSV row holds a dict literal ("string") and the id it belongs to.
+data = pd.read_csv(PERSON_001_PATH, names=["string", "id"])
+data = data.dropna()
+data = data[data["string"] != "{}"]  # drop empty dicts
+# ast.literal_eval safely parses the dict literals (eval would execute arbitrary code)
+data["string"] = [ast.literal_eval(k) for k in data["string"]]
+data["string"] = [return_keys_without_value(dic) for dic in data["string"]]
+data = data.dropna()  # rows whose dict had no empty-valued keys are now NaN
+
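+# Flatten the key lists: one output row per (string, id) pair.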
+string_list = []
+id_list = []
+for _, row in data.iterrows():
+ for elem in row["string"]:
+ string_list.append(elem)
+ id_list.append(row["id"])
+new_data = pd.DataFrame({"string": string_list, "id": id_list})
+new_data.to_csv(r"C:\Users\Emanuel\Desktop\DataHack16\data_separated.csv", encoding="utf-8")
+
+
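+# Language checks: a string counts as all-Hebrew / all-English when every
+# alphabetic character's Unicode name contains HEBREW / LATIN respectively.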
+def is_all_hebrew(s):
+    # Python 2 compatibility: decode byte strings to unicode.
+    try:
+        s = s.decode("utf-8")
+    except AttributeError:
+        pass
+
+    # keep only the alphabetic characters:
+    letters = "".join(c for c in s if c.isalpha())
+
+    # guard against the empty string, for which all() is vacuously True
+    return bool(letters) and all('HEBREW' in unicodedata.name(c) for c in letters)
+
+
+def is_all_english(s):
+    # Python 2 compatibility: decode byte strings to unicode.
+    try:
+        s = s.decode("utf-8")
+    except AttributeError:
+        pass
+
+    # keep only the alphabetic characters:
+    letters = "".join(c for c in s if c.isalpha())
+    return bool(letters) and all('LATIN' in unicodedata.name(c) for c in letters)
+
+
+def count_words(s):
+ return len(s.split())
+
+
+# todo: add a feature "contains_predefined_year_prefixes", like b. or d.
+# todo: add a feature that checks whether the string contains a number that is not a year (i.e. not in the range ...)
+# todo: detect Hebrew years using quotes
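+# Feature matrix: two language flags, a word count, and two punctuation flags.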
+new_data["is_all_hebrew"] = new_data["string"].apply(is_all_hebrew)
+new_data["is_all_english"] = new_data["string"].apply(is_all_english)
+new_data["number_of_words"] = new_data["string"].apply(count_words)
+new_data["contains_quote"] = new_data["string"].apply(lambda s: '"' in s)
+new_data["contains_colon"] = new_data["string"].apply(lambda s: ':' in s)
+new_data.to_csv(FEATURES_TABLE_PATH)
+
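+# Keep only the numeric/boolean feature columns for clustering.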
+X = new_data.copy()
+assert isinstance(X, pd.DataFrame)
+del X["id"]
+del X["string"]
+print(X.columns)
+X = (X - X.mean()) / (X.max() - X.min())  # mean normalization: centre each feature, scale by its range
+
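+# Try a couple of candidate cluster counts; random_state pins the k-means
+# initialisation so runs are reproducible.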
+range_n_clusters = [4, 6]
+for n_clusters in range_n_clusters:
+ clusterer = KMeans(n_clusters=n_clusters, random_state=10)
+ cluster_labels = clusterer.fit_predict(X)
+ centers = clusterer.cluster_centers_
+    print("\n%s clusters:" % n_clusters)
+    print("cluster labels: %s" % cluster_labels)
+    print("cluster centers: %s" % centers)
+
+ for k in range(n_clusters):
+ print("\ncluster %d consists of the following strings:" % k)
+ print(new_data["string"][cluster_labels == k])