# clustering_678.py
# coding=utf-8
import ast
import unicodedata

import pandas as pd
from sklearn.cluster import KMeans

PERSON_001_PATH = r"C:\Users\Emanuel\Desktop\DataHack16\PERSON - 001.csv"
FEATURES_TABLE_PATH = r"C:\Users\Emanuel\Desktop\DataHack16\features_678.csv"


# Collect the keys whose value is the empty string:
def return_keys_without_value(dic):
    # todo: take care of keys without value that do contain : somewhere
    keys = [key for key, value in dic.items() if value == ""]
    # Returning None (rather than []) lets the caller drop such rows with dropna()
    return keys if keys else None
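# Example (hypothetical dict): return_keys_without_value({"name": "X", "born": ""})
# returns ["born"]; a dict with no empty values yields None, which the
# dropna() below then removes.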


data = pd.read_csv(PERSON_001_PATH, names=["string", "id"])
data = data.dropna()
data = data[data["string"] != "{}"]
# literal_eval parses the dict literals safely; eval would execute arbitrary code
data["string"] = [ast.literal_eval(k) for k in data["string"]]
data["string"] = [return_keys_without_value(dic) for dic in data["string"]]
data = data.dropna()

# Flatten: one row per (key, id) pair instead of one row per key list
string_list = []
id_list = []
for _, row in data.iterrows():
    for elem in row["string"]:
        string_list.append(elem)
        id_list.append(row["id"])
new_data = pd.DataFrame({"string": string_list, "id": id_list})
new_data.to_csv(r"C:\Users\Emanuel\Desktop\DataHack16\data_separated.csv", encoding="utf-8")
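# Note: on pandas >= 0.25 the flattening loop above can be written as
# data.explode("string") for the same one-row-per-key result; the explicit
# loop keeps the script compatible with older pandas versions.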


def _letters_in_script(s, script_name):
    # Python 2 bytes need decoding; Python 3 str has no decode(), so skip it
    try:
        s = s.decode("utf-8")
    except AttributeError:
        pass
    # keep only the letters (digits, punctuation and spaces are ignored)
    letters = [c for c in s if c.isalpha()]
    return all(script_name in unicodedata.name(c) for c in letters)


def is_all_hebrew(s):
    return _letters_in_script(s, 'HEBREW')


def is_all_english(s):
    return _letters_in_script(s, 'LATIN')
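
# Example (hypothetical strings): is_all_hebrew(u"שלום עולם") -> True and
# is_all_english("hello world") -> True. Caveat: a string with no letters at
# all returns True from both checks, because all() over an empty sequence is True.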


def count_words(s):
    return len(s.split())


# todo: add a feature "contains_predefined_year_prefixes", like b. or d.
# todo: add a feature that checks whether the string contains a number that is not a year (i.e. not in the range ...)
# todo: detect Hebrew years using the quote (gershayim) they contain
# (minimal sketches of these three todos follow below)
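# A minimal sketch of the three features the todos describe. The prefix list
# and the year range are illustrative assumptions, not values taken from the
# data, and none of these are wired into the feature table yet.
import re

YEAR_PREFIXES = ("b.", "d.")  # hypothetical born/died markers


def contains_year_prefix(s):
    return any(prefix in s for prefix in YEAR_PREFIXES)


def contains_non_year_number(s, year_range=(1000, 2100)):  # assumed range
    low, high = year_range
    # True if the string holds any number outside the assumed year range
    return any(not (low <= int(tok) <= high) for tok in re.findall(r"\d+", s))


def looks_like_hebrew_year(s):
    # Hebrew years put gershayim (") before the last letter, e.g. תשע"ו
    return re.search(u'[\u05d0-\u05ea]"[\u05d0-\u05ea]', s) is not None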
new_data["is_all_hebrew"] = new_data["string"].apply(is_all_hebrew)
new_data["is_all_english"] = new_data["string"].apply(is_all_english)
new_data["number_of_words"] = new_data["string"].apply(count_words)
new_data["contains_quote"] = new_data["string"].apply(lambda s: '"' in s)
new_data["contains_colon"] = new_data["string"].apply(lambda s: ':' in s)
new_data.to_csv(FEATURES_TABLE_PATH, encoding="utf-8")

X = new_data.drop(["id", "string"], axis=1)  # keep only the feature columns
print(X.columns)
X = (X - X.mean()) / (X.max() - X.min())  # mean-normalize each feature
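# Caveat: if any feature column is constant, max - min is 0 and the column
# becomes all-NaN, which would make KMeans fail; dropping constant columns
# first (e.g. X = X.loc[:, X.max() != X.min()]) guards against that.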

range_n_clusters = [4, 6]
for n_clusters in range_n_clusters:
    clusterer = KMeans(n_clusters=n_clusters, random_state=10)
    cluster_labels = clusterer.fit_predict(X)
    centers = clusterer.cluster_centers_
    print("\n %s clusters:" % n_clusters)
    print("cluster labels: %s" % cluster_labels)
    print("cluster centers: %s " % centers)

    for k in range(n_clusters):
        print("\ncluster %d consists of the following strings:" % k)
        print(new_data["string"][cluster_labels == k])
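
# A minimal sketch of comparing the candidate cluster counts quantitatively:
# silhouette_score (from sklearn.metrics) rates how well-separated the
# clusters are; values closer to 1 are better.
from sklearn.metrics import silhouette_score

for n_clusters in range_n_clusters:
    labels = KMeans(n_clusters=n_clusters, random_state=10).fit_predict(X)
    print("n_clusters=%d -> silhouette=%.3f" % (n_clusters, silhouette_score(X, labels)))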