ScoreWalker/scorewalker-utils/AccuracyCounter/PageReduce.py


matches_list = [{'first_name': 'page00001', 'second_name': 'page00004'},
                {'first_name': 'page00004', 'second_name': 'page00001'},
                {'first_name': 'page00004', 'second_name': 'page00002'},
                {'first_name': 'page00001', 'second_name': 'page00002'},
                {'first_name': 'page00101', 'second_name': 'page00005'},
                {'first_name': 'page00005', 'second_name': 'page00101'},
                {'first_name': 'page00005', 'second_name': 'page00030'},
                {'first_name': 'page00050', 'second_name': 'page00030'},
                {'first_name': 'page00030', 'second_name': 'page00050'}]


print('%d Matches before reduce' % len(matches_list))
[print('  ~ %s -> %s' % (m['first_name'], m['second_name'])) for m in matches_list]
print('')


def page_reduce():
    result = []
    # We need to keep track of what we have seen...
    seen = []
    # I assume the data you get back is iterable in some way...
    for item in matches_list:
        # I don't remember what these values are in the data stream you get back
        page_1 = item['first_name']
        page_2 = item['second_name']

        # We need to order the pages to guarantee not adding duplicates.
        # These values can be "page<number>", lexicographical ordering will do the rest
        # should page_1 be first?
        if page_1 < page_2:
            key = '%s-%s' % (page_1, page_2)
        # Reverse the order otherwise.
        else:
            key = '%s-%s' % (page_2, page_1)
        # Now that we guaranteed that "page00001" -> "page00004" and "page00004" -> "page00001" produce the same key
        # we can check for the keys existence, and add it if it isn't there.
        if key not in seen:
            seen.append(key)
            result.append(item)
        else:
            print('Found duplicate of %s at index %d' % (key, matches_list.index(item)))
    # here is the result...
    return result


reduced = page_reduce()
print('')
print('%d Matches after reduce (%d were duplicates)' % (len(reduced), len(matches_list) - len(reduced)))
[print('  ~ %s -> %s' % (m['first_name'], m['second_name'])) for m in reduced]


def page_group():
    # The full match sets
    match_sets = []
    # go through the list of items
    for item in matches_list:
        page_1 = item['first_name']
        page_2 = item['second_name']
        # We need to keep track of weather or not we found a "parent set"
        found = False
        for i in range(len(match_sets)):
            m_set = match_sets[i]
            pages = m_set.split('-')
            # If either page one or page two are in the current match set
            if page_1 in pages or page_2 in pages:
                # We only need to add the one which isn't already in the set (it's possible both are)
                # NOTE: The sets here do not have to be in order, since we are not looking to remove duplicates.
                if page_1 not in pages:
                    m_set = '%s-%s' % (m_set, page_1)
                elif page_2 not in pages:
                    m_set = '%s-%s' % (m_set, page_2)
                # We found it!
                found = True
                # Set the element
                match_sets[i] = m_set
        # If we didn't find it, we should add an element
        if not found:
            match_sets.append('%s-%s' % (page_1, page_2))

    return match_sets

full_sets = page_group()
print('')
print('%d Match sets (%d were transitive)' % (len(full_sets), len(matches_list) - len(full_sets)))
[print('  ~ %s' % m) for m in full_sets]