Commit 3a6b1276 authored by cristian.bell's avatar cristian.bell
Browse files

misc changes

parent 7bf402ce
Analyzing anonymized buying habits
--------------------------------------------------------------
This is the tool used to generate purchases based on recipes from chefkoch.de and then mine association rules in these purchases.
This is the code for the BA-Thesis "Analyzing anonymized buying habits.".
The main goal is to investigate if analysis of buying habits is possible using data that completely maintains the anonymity of the buyers.
The code contains the tool used to generate purchases based on recipes from chefkoch.de and then mine association rules in these purchases, as well as all other scripts used to generate data for the paper.
![screenshot](./paper/images/web_screenshot.png?raw=true "anonimized_purchase_data")
Installation
-------------
To use the command-line interface just make sure all sql files are in the configured directories and type `python3 assoc_rules.py`.
Using the web interface:
To use the web component you need to install [flask](http://flask.pocoo.org/) - a micro web framework for Python.
- It is recommended to run everything in a **virtual environment** [venv](https://docs.python.org/3/tutorial/venv.html) where you can install **flask** and run the whole project - **OPT A**. Alternatively flask can be installed globally and everything can be run from ./web_component, **OPT B**.
......@@ -33,3 +35,5 @@ To verify that Flask is correctly installed run `pip3 list` and see it listed am
- the path to the sql data files (chefkoch.w.data.500.db or chefkoch.w.data.200.db) and to the results output folder need to be set via `sql_data_dir` and `results_dir` variables in `./core/assoc_rule_finder.py`.
- run the app with `python3 app.py`
- for **OPT A** type `deactivate` at any time to exit the virtual environment;
![screenshot](./paper/images/web_screenshot.png?raw=true "anonymized_purchase_data")
\ No newline at end of file
from core import assoc_rules_find
from core.assoc_rules_find import min_support_default
from core import assoc_rules_finder
from core.assoc_rules_finder import min_support_default
def menu():
......@@ -25,18 +25,18 @@ def menu():
else:
print_minimal = False
assoc_rules_find.filter_items(min_support) # remove items with support < min_supp
assoc_rules_finder.filter_items(min_support) # remove items with support < min_supp
# action = input("find association (r)ules or e(x)it: ")
action = 'r'
if action == 'r':
assoc_rules_find.find_assoc_rules(print_minimal)
assoc_rules_finder.find_assoc_rules(print_minimal)
elif action == 'x':
exit()
menu()
assoc_rules_find.start(True)
assoc_rules_finder.start(True)
menu()
This diff is collapsed.
......@@ -228,10 +228,20 @@ class FPTree:
def print_root(self):
print(self.__root)
def return_all_nodes(self):
def return_all_nodes(self, per_level=False):
all_nodes = []
for node in self.tmp_nodes:
all_nodes.append(node.to_dict())
if per_level:
level = 0
while level <= self.height:
level_nodes = []
nodes = self.get_nodes(level)
for node in nodes:
level_nodes.append(node.to_dict())
level += 1
all_nodes.append(level_nodes)
else:
for node in self.tmp_nodes:
all_nodes.append(node.to_dict())
return all_nodes
'''
......
......@@ -43,7 +43,8 @@ class Node:
return self.__children
def to_dict(self):
return {'val': self.value, 'supp': self.support, 'p': self.parent, 'lvl': self.level, 'nxt': self.next_node_id}
return {'id': self.nid, 'val': self.value, 'supp': self.support, 'p': self.parent, 'c': self.level,
'lvl': self.level, 'nxt': self.next_node_id}
def __repr__(self):
if self.print_minimal:
......
......@@ -14,10 +14,10 @@ class Util:
@staticmethod
def fp_powerset(a_list, min_elem_nr=1, max_elem_nr=None):
print(min_elem_nr)
exit()
from itertools import chain, permutations, combinations
s = list(a_list)
if not min_elem_nr:
min_elem_nr = 1
if not max_elem_nr:
max_elem_nr = len(s) + 1
return chain.from_iterable(combinations(s, r) for r in range(min_elem_nr, max_elem_nr))
......
This diff is collapsed.
paper/images/web_screenshot.png

189 KB | W: | H:

paper/images/web_screenshot.png

190 KB | W: | H:

paper/images/web_screenshot.png
paper/images/web_screenshot.png
paper/images/web_screenshot.png
paper/images/web_screenshot.png
  • 2-up
  • Swipe
  • Onion skin
paper/images/web_screenshot_k.png

376 KB | W: | H:

paper/images/web_screenshot_k.png

374 KB | W: | H:

paper/images/web_screenshot_k.png
paper/images/web_screenshot_k.png
paper/images/web_screenshot_k.png
paper/images/web_screenshot_k.png
  • 2-up
  • Swipe
  • Onion skin
from web_app import app as application
# Script entry point: the server is started only when this file is executed
# directly, not when `application` is imported from it.
if __name__ == "__main__":
# Serves on port 8080, bound to all interfaces ('0.0.0.0'), with debug=True.
# NOTE(review): `application` is presumably the Flask app mentioned in the
# README; Flask's debug mode enables an interactive debugger, so running it on
# a publicly reachable address should be limited to trusted networks - confirm.
application.run(port=8080, host='0.0.0.0', debug=True)
'''
to start just run:
python app.py OR python3 app.py
'''
\ No newline at end of file
from core import assoc_rules_finder
from core.assoc_rules_finder import min_support_default
def menu():
    """Run the interactive command-line loop for mining association rules.

    Every round asks for a minimum support and a print mode, filters the
    items with ``assoc_rules_finder.filter_items`` and then calls
    ``assoc_rules_finder.find_assoc_rules``. Entering ``x`` at the
    min_support prompt ends the program.

    The rounds are driven by a loop instead of the function calling itself,
    so a long session cannot run into the interpreter's recursion limit.
    """
    while True:
        min_support_in = input("\nplease input a min_support [in % or 1 to 10] OR x to exit: ")
        min_support = min_support_default
        try:
            min_support_in = int(min_support_in)
        except ValueError:
            if min_support_in == 'x':
                exit()
            print('Invalid number entered, using default min_support: {}'.format(min_support_default))
        else:
            # integers outside 1..100 silently keep the default
            if 0 < min_support_in <= 100:
                min_support = min_support_in

        # Compare by value: `is 'y'` tests object identity, which only works
        # by accident of string caching and triggers a SyntaxWarning.
        print_minimal = input("minimal tree print [y/n] (def:n): ") in ('y', 'Y')

        assoc_rules_finder.filter_items(min_support)  # remove items with support < min_supp

        # action = input("find association (r)ules or e(x)it: ")
        action = 'r'  # the prompt above is disabled, rules are always mined
        if action == 'r':
            assoc_rules_finder.find_assoc_rules(print_minimal)
        elif action == 'x':
            exit()
# Script start-up: call the rule finder's start() and then enter the menu.
# NOTE(review): the meaning of the `True` argument is defined in
# core/assoc_rules_finder.py, which is not visible here - confirm.
assoc_rules_finder.start(True)
menu()
This diff is collapsed.
from core.fp_tree import FPTree
class CFPTree(FPTree):
def __init__(self):
    """Create an empty conditional FP tree with an empty end table."""
    # nids of the last node of every branch added through add_branch()
    self.end_tbl = []
    FPTree.__init__(self)
def add_branch(self, transaction, node, level=1, counter=1):
    """Add a branch like ``FPTree.add_branch`` and remember its end node.

    The nid of the last node created for ``transaction`` is appended to
    ``self.end_tbl``. Nothing is recorded when the parent method creates no
    node and returns ``None``.

    :return: the last node of the new branch or ``None``, exactly what the
        overridden parent method returns, so callers can rely on the same
        contract for both tree classes
    """
    new_node = FPTree.add_branch(self, transaction, node, level, counter)
    if new_node:
        self.end_tbl.append(new_node.nid)
    return new_node
def draw(self, minimal=False):
    """Draw the conditional FP tree on the CLI, followed by its end table.

    :param minimal: forwarded to ``FPTree.draw`` so the compact node layout
        can be requested for conditional trees as well

    The end table goes through ``self.util.print`` and is therefore only
    shown when the tree's ``Util`` helper runs in debug mode.
    """
    print(self.colors.colorize('\ncond FP Tree:', self.colors.UNDERLINE))
    FPTree.draw(self, minimal)
    self.util.print(self.colors.colorize('end_tbl: {0}\n'.format(self.end_tbl)))
'''
:return []
retrieves all pats from the cond FP tree. Adds all passing paths [support>min_supp] to the list of freq_patterns
'''
# Collects the frequent patterns for `item` from this conditional tree.
#   min_support: threshold compared with node.support and with the summed
#                support of values that stayed below it on single nodes
#   item:        value placed first in every pattern list that is built
# Returns the list `freq` of {support: [item, value, ...]} dicts; when it holds
# exactly one entry, that dict itself is returned instead of the list.
# NOTE(review): block nesting is not visible in this view, so the comments
# below describe the presumable structure - confirm against the repository.
def get_all_paths(self, min_support, item):
freq = []
# nids already added to infreq_values, so that no node is summed twice
infreq_nodes = []
# value -> summed support of nodes whose own support is below min_support
infreq_values = {}
for entry in self.end_tbl:
node = self.get_node_by_nid(entry)
# end-table entries that have children are skipped
if len(node.get_children()) > 0:
continue
# support -> [item, values collected while walking towards the root]
freq_branch = dict()
# a falsy parent ends the walk, so a node without a parent is not examined
while node.parent:
self.util.print('{} |{}'.format(node.value, node.support))
if node.support >= min_support:
# a support value met for the first time on this walk opens a new list
if node.support not in freq_branch.keys():
freq_branch[node.support] = [item]
# presumably every list opened so far on this walk receives the value
for key in freq_branch.keys():
freq_branch[key].append(node.value)
# if node support is less than min_supp add the value to a temp dict. Check all items at the end, maybe
# added support from different branches is >= than min_support
else:
if node.nid not in infreq_nodes:
if node.value not in infreq_values.keys():
infreq_values[node.value] = node.support
else:
infreq_values[node.value] += node.support
infreq_nodes.append(node.nid)
node = self.get_node_by_nid(node.parent)
# identical branch dicts are stored only once
if len(freq_branch) > 0 and freq_branch not in freq:
freq.append(freq_branch)
self.util.print('CFPTree.get_all_paths(infreq_vals): {}'.format(infreq_values)) #@todo: return this?
# go over the items who didn't pass the min_support test, see if their sum does
for val, item_freq in infreq_values.items():
if item_freq >= min_support:
self.util.print('adding infreq_items {}:[{}, {}]'.format(item_freq, item, val))
freq.append({item_freq: [item, val]})
# NOTE(review): the return type differs by count - a dict for one pattern, a
# list otherwise (including the empty list) - callers must handle both.
if len(freq) == 1:
return freq[0]
return freq
from core.util import Util
from core.util import BColors
from core.node import Node
class FPTree:
def __init__(self):
    """Set up an empty tree: no root, no nodes and an empty header table."""
    # helpers for debug-gated output and coloured terminal text
    self.util = Util(False)
    self.colors = BColors()
    # tree state
    self.__root = None
    self.tmp_nodes = []   # every node of the tree, in creation order
    self.hdr_table = {}   # value -> nid of the first node with that value
    self.nr_of_nodes = 0
    self.height = 0
    self.current_level = 0
def insert(self, transaction, counter=1):
    """Insert a transaction into the tree.

    Without a root, one is created and the whole transaction is added as a
    branch below it. Otherwise the existing path is followed for as long as
    it matches the transaction, adding ``counter`` to the support of every
    node on it; the unmatched rest is then added as a new branch.
    """
    self.util.print(self.colors.colorize('inserting: {}'.format(transaction), self.colors.GR))
    if not self.__root:
        self.__root = self.new_node('root')
        self.add_branch(transaction, self.__root, 1, counter)
        return

    self.current_level = 0
    parent_node = self.__root
    rest_transaction = list(transaction)
    for item in transaction:
        self.current_level += 1
        self.util.print('look for {} at {}'.format(item, self.current_level))
        # first child carrying the wanted value, if any
        match = next((child for child in parent_node.get_children() if child.value == item), None)
        if match is None:
            break
        self.util.print(self.colors.colorize('found {}'.format(match) +
                                             'lvl {} and val {}'.format(self.current_level, item)))
        match.support += counter
        rest_transaction.remove(item)
        parent_node = match
    self.util.print('rest transaction: {}'.format(rest_transaction))
    self.add_branch(rest_transaction, parent_node, self.current_level, counter)
def add_branch(self, transaction, node, level=1, counter=1):
    """Append the items of ``transaction`` as a chain of new nodes below ``node``.

    The first new node is created at ``level``; each following one is a
    level deeper and a child of the node created before it.

    :return: the last node created, or ``None`` when ``transaction`` is
        empty or ``node`` is ``None``
    """
    if node is None or len(transaction) == 0:
        return None
    self.util.print('add_branch() to {}'.format(node))
    tail = node
    for depth, value in enumerate(transaction, start=level):
        created = self.new_node(value, depth, counter)
        if tail is not None:
            tail.add_child(created)
            created.add_parent(tail.nid)
        tail = created
    return tail
def update_hdr_table(self, value, new_node):
    """Register ``new_node`` in the header table under ``value``.

    The first node seen for a value becomes the header entry. Every later
    node is linked to the end of that value's chain: the ``next_node_id`` of
    the current last node is set to the nid of the new node. A
    ``next_node_id`` of 0 marks the end of a chain.
    """
    if value not in self.hdr_table:
        self.hdr_table[value] = new_node.nid
        return
    next_node_id = self.hdr_table[value]
    last_node = self.get_node_by_nid(next_node_id)
    # Compare by value: `is not 0` checks object identity, which is not
    # guaranteed for equal integers and raises a SyntaxWarning.
    while next_node_id != 0:
        last_node = self.get_node_by_nid(next_node_id)
        next_node_id = last_node.next_node_id
    last_node.next_node_id = new_node.nid
def get_node_by_nid(self, nid):
    """Return the node whose ``nid`` equals ``nid``.

    :return: the first matching node of ``self.tmp_nodes``, or ``None``
        when no node carries that nid

    The ids are compared by value. An identity check (``is``) only matches
    when both sides happen to be the very same int object, which is not
    guaranteed for equal numbers.
    """
    for node in self.tmp_nodes:
        if node.nid == nid:
            return node
    return None
def get_root(self):
    """Return the root node (``None`` while nothing has been inserted)."""
    return self.__root
def get_nodes(self, level):
    """Return a list of all nodes located on the given level.

    Levels are compared by value (``==``), matching ``get_nodes_count``. An
    identity check would miss nodes whose level is an equal but distinct
    int object.
    """
    return [node for node in self.tmp_nodes if node.level == level]
def get_nodes_count(self, level):
    """Return how many nodes are located on the given level."""
    return sum(1 for node in self.tmp_nodes if node.level == level)
def new_node(self, value=None, level=0, counter=1, nid=-1):
    """Create a node, register it with the tree and return it.

    :param value: item value stored in the node
    :param level: tree level of the node; raises ``self.height`` if deeper
    :param counter: initial support of the node
    :param nid: explicit node id; the default ``-1`` assigns the next free
        id (``self.nr_of_nodes + 1``)
    :return: the new node, or ``None`` if anything went wrong while creating
        or registering it (a short message is printed in that case)
    """
    try:
        # value comparison: `nid is -1` relied on int object identity
        if nid == -1:
            nid = self.nr_of_nodes + 1
        node = Node(nid, value, counter, level)
        self.nr_of_nodes += 1
        self.tmp_nodes.append(node)
        self.update_hdr_table(value, node)
        if level > self.height:
            self.height = level
        return node
    except Exception:
        # best effort as before: report and let the caller deal with None
        print('err: new_node()')
        return None
def get_path_up(self, node, info=False):
    """Return the ancestors of ``node``, nearest first, excluding the root.

    :param node: node to start from; it is not part of the result itself
    :param info: ``'val'`` collects the ancestors' values, ``'nid'`` their
        ids, anything else the node objects themselves
    :return: list of the collected entries (empty when ``node`` is the root
        or a direct child of it)

    ``info`` is compared with ``==``. An identity check against a string
    literal only succeeds for interned strings and would silently fall back
    to returning node objects.
    """
    path = []
    while node is not self.__root:
        node = self.get_node_by_nid(node.parent)
        if node is self.__root:
            break
        if info == 'val':
            path.append(node.value)
        elif info == 'nid':
            path.append(node.nid)
        else:
            path.append(node)
    return path
'''
draws the tree to the CLI
'''
# Prints the tree level by level: one row with the nodes of a level and one
# row with connector characters for their children, then the header table.
#   minimal: selects the narrower spacers and is copied to each node's
#            print_minimal flag while that node is formatted
# Side effect: self.tmp_nodes is sorted in place by (level, parent).
# NOTE(review): block nesting is not visible in this view; comments describe
# the presumable structure - confirm against the repository.
def draw(self, minimal=False):
from operator import attrgetter
self.tmp_nodes.sort(key=attrgetter('level', 'parent'))
level = 0
max_width = 0
# assigned below but never read in this method
max_width_lvl = 0
if minimal:
spacer = ' ' * 12
spacer2 = ' ' * 22
spacer3 = ' ' * 8
spacer4 = ' ' * 4
else:
spacer = ' ' * 18
spacer2 = ' ' * 36
spacer3 = ' ' * 15
spacer4 = ' ' * 0
# first pass: find the largest number of nodes on any level
while level <= self.height:
width = self.get_nodes_count(level)
if width > max_width:
max_width = width
max_width_lvl = level
level += 1
level = 0
# second pass: build and print the two rows of every level
while level <= self.height:
nodes = self.get_nodes(level)
# levels with fewer nodes than the widest one get a left padding
level_spacer = int((max_width-len(nodes))/2)
row1 = spacer2*level_spacer
row2 = spacer2*level_spacer
for node in nodes:
node.print_minimal = minimal
row1 += ' {} {}'.format(node, spacer4)
children = node.get_children()
children_nr = len(children)
# connector depends on the child count: more than two, two, or one
# NOTE(review): '\ ' is not a recognised escape sequence; the backslash is
# kept as written, but recent Python versions warn about it - consider '\\'.
if children_nr > 2:
# row1 = spacer2 + row1
row2 += spacer3 + ' /|\ '
elif len(children) > 1:
# row1 = spacer + row1
row2 += spacer3 + ' / \ '
elif len(children) > 0:
row2 += spacer3 + ' | '
row2 += spacer
# the flag is reset to False regardless of its value before the call
node.print_minimal = False
print(row1)
print(row2)
level += 1
print(self.colors.colorize('hdr_tbl:{}'.format(self.hdr_table)))
def print_inorder(self):
    """Print the nodes of the tree, starting the recursive walk at the root."""
    root = self.__root
    self.__print_inorder_r(root)
'''
prints the current node and recursively calls itself for all current node children
'''
# Recursive helper of print_inorder(): visits the children of current_node
# and prints current_node itself; a falsy node ends the recursion.
# NOTE(review): indentation is not visible here - presumably the node is
# printed once, after all of its children were visited; confirm.
def __print_inorder_r(self, current_node):
if not current_node:
return
for child in current_node.get_children():
self.__print_inorder_r(child)
print(current_node)
def print_root(self):
    """Print only the root node."""
    root = self.__root
    print(root)
def return_all_nodes(self, per_level=False):
    """Return the ``to_dict()`` form of every node.

    :param per_level: when true, the result holds one list per level from 0
        up to ``self.height``; otherwise it is one flat list in the order of
        ``self.tmp_nodes``
    """
    if not per_level:
        return [node.to_dict() for node in self.tmp_nodes]
    return [
        [node.to_dict() for node in self.get_nodes(level)]
        for level in range(self.height + 1)
    ]
'''
'''
# Text form of the tree: a summary line, one line per node and the header
# table.
# NOTE(review): the number printed after 'leaf:' is len(self.tmp_nodes), i.e.
# the count of all stored nodes, not only of leaves.
# NOTE(review): indentation is not visible here, so it is unclear whether the
# 'hdr_tbl' part is added only when a root exists - confirm.
def __repr__(self):
output = 'leaf: {}, height: {}\n'.format(len(self.tmp_nodes), self.height)
if self.__root is not None:
for node in self.tmp_nodes:
output += '{}\n'.format(node)
output += 'hdr_tbl: {}'.format(self.hdr_table)
return output
class Node:
    """A single node of an FP tree.

    Stores the item value, its support counter, the nid of the parent node,
    the child nodes, the tree level and the nid of the next node in the
    node-link chain (``next_node_id``, 0 while unset).
    """

    def __init__(self, nid=0, value=None, support=0, level=0, children=None):
        self.nid = nid
        self.value = value
        self.support = support
        self.level = level
        self.parent = None
        self.next_node_id = 0
        self.print_minimal = False
        # a list handed in by the caller is stored as is, not copied
        self.__children = [] if children is None else children

    def increase_counter(self):
        """Increase the counter [support] of the node by one."""
        self.support += 1

    def add_child(self, node):
        """Append a child; anything that is neither a ``Node`` nor ``None`` is ignored."""
        if node is not None and not isinstance(node, Node):
            return
        self.__children.append(node)

    def add_parent(self, nid):
        """Set the parent nid of the node; ``None`` leaves it unchanged."""
        if nid is None:
            return
        self.parent = nid

    def get_children(self):
        """Return the list of children of the node."""
        return self.__children

    def to_dict(self):
        """Return the node's data as a plain dict (both 'c' and 'lvl' hold the level)."""
        return {
            'id': self.nid,
            'val': self.value,
            'supp': self.support,
            'p': self.parent,
            'c': self.level,
            'lvl': self.level,
            'nxt': self.next_node_id,
        }

    def __repr__(self):
        """Return the short form when ``print_minimal`` is set, else the detailed one."""
        if not self.print_minimal:
            return "{}.[{}|{} p:{} #cld:{} LVL:{} nxt_lnk:{}]".format(
                self.nid, self.value, self.support, self.parent,
                len(self.__children), self.level, self.next_node_id)
        return "{}.[{}|{} p:{} -->{}]".format(self.nid, self.value, self.support, self.parent, self.next_node_id)
class Util:
    """Small helpers: debug-gated printing and subset enumeration."""

    def __init__(self, debug=False):
        self.debug = debug

    def print(self, text):
        """Print ``text`` only when the instance was created with debug on."""
        if not self.debug:
            return
        print(text)

    @staticmethod
    def print_list(a_list):
        """Print the length of ``a_list`` and each of its ``.items()`` pairs."""
        print("list w/ len {}:".format(len(a_list)))
        for key, val in a_list.items():
            print("{} -> {} ".format(key, val))

    @staticmethod
    def fp_powerset(a_list, min_elem_nr=1, max_elem_nr=None):
        """Return an iterator over the combinations of ``a_list`` by size.

        Sizes run from ``min_elem_nr`` (1 when falsy) up to, but excluding,
        ``max_elem_nr`` (``len(a_list) + 1`` when falsy).
        """
        from itertools import chain, combinations
        items = list(a_list)
        lower = min_elem_nr or 1
        upper = max_elem_nr or len(items) + 1
        sizes = range(lower, upper)
        return chain.from_iterable(combinations(items, size) for size in sizes)
class BColors:
    """ANSI escape sequences for coloured terminal output."""
    HDR = '\033[95m'
    BL = '\033[34m'
    CYAN = '\033[96m'
    GR = '\033[92m'
    WARN = '\033[93m'
    FAIL = '\033[91m'
    ENDC = '\033[0m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'
    LIBL = '\033[94m'

    def colorize(self, text=None, color=None):
        """Wrap ``text`` in ``color`` and the reset sequence ``ENDC``.

        :param text: string to colour; the default ``None`` is treated as an
            empty string instead of failing on the concatenation
        :param color: one of the escape sequences above; ``CYAN`` is used
            when it is missing or empty
        :return: the coloured string
        """
        if not color:
            color = self.CYAN
        if text is None:
            text = ''
        return color + text + self.ENDC
# Print iterations progress
def print_progress(iteration, total, prefix='', suffix='', decimals=1, bar_length=100):
import sys
sys.stdout.flush()
"""
Call in a loop to create terminal progress bar
@params:
iteration - Required : current iteration (Int)
total - Required : total iterations (Int)
prefix - Optional : prefix string (Str)