# Demo script: de-duplicate symmetric page-match pairs, then group the
# remaining pairs into transitive match sets.
# Sample data: each dict records a pair of page names reported as matching.
# Several entries are the same pair in reverse order (deliberate duplicates).
matches_list = [{'first_name': 'page00001', 'second_name': 'page00004'},
                {'first_name': 'page00004', 'second_name': 'page00001'},
                {'first_name': 'page00004', 'second_name': 'page00002'},
                {'first_name': 'page00001', 'second_name': 'page00002'},
                {'first_name': 'page00101', 'second_name': 'page00005'},
                {'first_name': 'page00005', 'second_name': 'page00101'},
                {'first_name': 'page00005', 'second_name': 'page00030'},
                {'first_name': 'page00050', 'second_name': 'page00030'},
                {'first_name': 'page00030', 'second_name': 'page00050'}]

print('%d Matches before reduce' % len(matches_list))
# Plain loop: a list comprehension built only for its print() side effects
# allocates a throwaway list of Nones.
for m in matches_list:
    print(' ~ %s -> %s' % (m['first_name'], m['second_name']))
print('')
|
|
|
|
|
|
def page_reduce(matches=None):
    """Drop symmetric duplicates, keeping the first occurrence of each pair.

    "a -> b" and "b -> a" count as the same match; the earlier entry wins.
    Prints a diagnostic line for every duplicate it discards.

    Parameters:
        matches: iterable of dicts with 'first_name' and 'second_name' keys.
            Defaults to the module-level ``matches_list`` (keeps the original
            zero-argument call working).

    Returns:
        A new list containing the first occurrence of each unordered pair.
    """
    if matches is None:
        matches = matches_list
    result = []
    # A set gives O(1) membership tests; the original list made the whole
    # reduction quadratic.
    seen = set()
    for index, item in enumerate(matches):
        page_1 = item['first_name']
        page_2 = item['second_name']
        # Order the two names so that both directions of a pair produce the
        # same key. Names look like "page<number>", so lexicographic
        # comparison is sufficient.
        if page_1 < page_2:
            key = '%s-%s' % (page_1, page_2)
        else:
            key = '%s-%s' % (page_2, page_1)
        if key not in seen:
            seen.add(key)
            result.append(item)
        else:
            # enumerate() gives the true position; list.index(item) is O(n)
            # and reports the FIRST equal dict, which is wrong whenever the
            # same pair value appears more than twice.
            print('Found duplicate of %s at index %d' % (key, index))
    return result
|
|
|
|
|
|
# Run the reduction and report what survived.
reduced = page_reduce()

print('')
print('%d Matches after reduce (%d were duplicates)' % (len(reduced), len(matches_list) - len(reduced)))
# Plain loop instead of a side-effect-only list comprehension.
for m in reduced:
    print(' ~ %s -> %s' % (m['first_name'], m['second_name']))
|
|
|
|
|
|
def page_group(matches=None):
    """Group match pairs into transitive sets of connected pages.

    If a matches b and b matches c, then a, b and c belong to one set.
    Sets are encoded as '-'-joined strings of page names, in first-seen
    order (the sets are NOT sorted — duplicates are page_reduce's job).

    Parameters:
        matches: iterable of dicts with 'first_name' and 'second_name' keys.
            Defaults to the module-level ``matches_list`` (keeps the original
            zero-argument call working).

    Returns:
        A list of '-'-joined page-name strings, one per connected set.
    """
    if matches is None:
        matches = matches_list
    match_sets = []
    for item in matches:
        page_1 = item['first_name']
        page_2 = item['second_name']
        # Every existing set that already contains either page. The original
        # stopped at "found one" per set and appended the missing page to
        # EACH matching set, so a pair bridging two sets left them split and
        # overlapping instead of merged.
        hits = [i for i, m_set in enumerate(match_sets)
                if page_1 in m_set.split('-') or page_2 in m_set.split('-')]
        if not hits:
            # Neither page seen before: start a new set.
            match_sets.append('%s-%s' % (page_1, page_2))
            continue
        # Merge every touched set into the first one, preserving
        # first-seen page order and skipping pages already present.
        pages = match_sets[hits[0]].split('-')
        for i in hits[1:]:
            for page in match_sets[i].split('-'):
                if page not in pages:
                    pages.append(page)
        for page in (page_1, page_2):
            if page not in pages:
                pages.append(page)
        match_sets[hits[0]] = '-'.join(pages)
        # Remove the sets that were merged away, highest index first so the
        # remaining indices stay valid.
        for i in reversed(hits[1:]):
            del match_sets[i]
    return match_sets
|
|
|
|
# Build the transitive groups and report them.
full_sets = page_group()

print('')
print('%d Match sets (%d were transitive)' % (len(full_sets), len(matches_list) - len(full_sets)))
# Plain loop instead of a side-effect-only list comprehension.
for m in full_sets:
    print(' ~ %s' % m)
|