ScoreWalker/scorewalker-utils/AccuracyCounter/PageReduce.py
2025-03-13 00:13:53 -06:00

87 lines
3.5 KiB
Python

# Sample match data: each entry says that page `first_name` matched page
# `second_name`. It deliberately contains mirrored pairs (A -> B and B -> A)
# and chains (A -> B, B -> C) so the reduce/group steps have work to do.
matches_list = [
    {'first_name': 'page00001', 'second_name': 'page00004'},
    {'first_name': 'page00004', 'second_name': 'page00001'},
    {'first_name': 'page00004', 'second_name': 'page00002'},
    {'first_name': 'page00001', 'second_name': 'page00002'},
    {'first_name': 'page00101', 'second_name': 'page00005'},
    {'first_name': 'page00005', 'second_name': 'page00101'},
    {'first_name': 'page00005', 'second_name': 'page00030'},
    {'first_name': 'page00050', 'second_name': 'page00030'},
    {'first_name': 'page00030', 'second_name': 'page00050'},
]

# Report the raw input before any reduction happens.
print('%d Matches before reduce' % len(matches_list))
for match in matches_list:
    print(' ~ %s -> %s' % (match['first_name'], match['second_name']))
print('')
def page_reduce(matches=None):
    """Drop matches that repeat an earlier page pair in either direction.

    Two matches are considered duplicates when they name the same two pages,
    regardless of order, so ``page00001 -> page00004`` and
    ``page00004 -> page00001`` collapse to whichever appears first.

    Args:
        matches: Iterable of dicts with ``'first_name'`` and ``'second_name'``
            keys. Defaults to the module-level ``matches_list`` so existing
            ``page_reduce()`` callers keep working.

    Returns:
        A new list holding the first occurrence of each unordered pair, in
        input order. The input is not modified.

    Side effects:
        Prints one line per dropped match, giving the pair key and the
        position of the dropped entry in the input.
    """
    if matches is None:
        matches = matches_list

    result = []
    # Keys of the unordered pairs already kept; a set gives O(1) lookups.
    seen = set()
    # enumerate() supplies the true position of each entry. Looking the item
    # up with list.index() would return the first *equal* dict instead, which
    # is the wrong position when the same entry occurs twice.
    for index, item in enumerate(matches):
        # Sorting the two names makes the key independent of direction;
        # page names compare lexicographically.
        low, high = sorted((item['first_name'], item['second_name']))
        key = '%s-%s' % (low, high)
        if key in seen:
            print('Found duplicate of %s at index %d' % (key, index))
            continue
        seen.add(key)
        result.append(item)
    return result
# Run the de-duplication on the sample data and report what survived.
reduced = page_reduce()
print('')
print('%d Matches after reduce (%d were duplicates)'
      % (len(reduced), len(matches_list) - len(reduced)))
for kept in reduced:
    print(' ~ %s -> %s' % (kept['first_name'], kept['second_name']))
def page_group(matches=None):
    """Group pages into sets of transitively matching pages.

    Pages that match each other directly or through a chain of matches
    (A matches B, B matches C) end up in the same set. When a match links two
    sets that were built separately, those sets are merged into one, so every
    page appears in exactly one set.

    Args:
        matches: Iterable of dicts with ``'first_name'`` and ``'second_name'``
            keys. Defaults to the module-level ``matches_list`` so existing
            ``page_group()`` callers keep working.

    Returns:
        A list of strings, one per set, each the set's page names joined with
        ``'-'`` in the order the pages were first encountered. Sets are listed
        in the order their first page appeared. A page matched only with
        itself yields a single-page set.
    """
    if matches is None:
        matches = matches_list

    # Each group is a list of page names in first-seen order. Groups stay
    # disjoint: any match touching two groups merges them straight away.
    groups = []
    for item in matches:
        page_1 = item['first_name']
        page_2 = item['second_name']

        # Every existing group that already holds either page (at most two,
        # because groups are disjoint).
        touching = [g for g in groups if page_1 in g or page_2 in g]

        if not touching:
            # Neither page has been seen before: start a new group.
            groups.append([page_1] if page_1 == page_2 else [page_1, page_2])
            continue

        target, *others = touching
        if others:
            # This match bridges previously separate groups; fold the later
            # ones into the earliest so the result is truly transitive.
            for other in others:
                target.extend(other)
            groups = [g for g in groups if all(g is not o for o in others)]

        # Add whichever page of the pair the group does not hold yet.
        for page in (page_1, page_2):
            if page not in target:
                target.append(page)

    return ['-'.join(group) for group in groups]
# Build the transitive match sets from the sample data and report them.
full_sets = page_group()
print('')
print('%d Match sets (%d were transitive)'
      % (len(full_sets), len(matches_list) - len(full_sets)))
for group in full_sets:
    print(' ~ %s' % group)