matches_list = [{'first_name': 'page00001', 'second_name': 'page00004'}, {'first_name': 'page00004', 'second_name': 'page00001'}, {'first_name': 'page00004', 'second_name': 'page00002'}, {'first_name': 'page00001', 'second_name': 'page00002'}, {'first_name': 'page00101', 'second_name': 'page00005'}, {'first_name': 'page00005', 'second_name': 'page00101'}, {'first_name': 'page00005', 'second_name': 'page00030'}, {'first_name': 'page00050', 'second_name': 'page00030'}, {'first_name': 'page00030', 'second_name': 'page00050'}] print('%d Matches before reduce' % len(matches_list)) [print(' ~ %s -> %s' % (m['first_name'], m['second_name'])) for m in matches_list] print('') def page_reduce(): result = [] # We need to keep track of what we have seen... seen = [] # I assume the data you get back is iterable in some way... for item in matches_list: # I don't remember what these values are in the data stream you get back page_1 = item['first_name'] page_2 = item['second_name'] # We need to order the pages to guarantee not adding duplicates. # These values can be "page", lexicographical ordering will do the rest # should page_1 be first? if page_1 < page_2: key = '%s-%s' % (page_1, page_2) # Reverse the order otherwise. else: key = '%s-%s' % (page_2, page_1) # Now that we guaranteed that "page00001" -> "page00004" and "page00004" -> "page00001" produce the same key # we can check for the keys existence, and add it if it isn't there. if key not in seen: seen.append(key) result.append(item) else: print('Found duplicate of %s at index %d' % (key, matches_list.index(item))) # here is the result... return result reduced = page_reduce() print('') print('%d Matches after reduce (%d were duplicates)' % (len(reduced), len(matches_list) - len(reduced))) [print(' ~ %s -> %s' % (m['first_name'], m['second_name'])) for m in reduced] def page_group(): # The full match sets match_sets = [] # go through the list of items for item in matches_list: page_1 = item['first_name'] page_2 = item['second_name'] # We need to keep track of weather or not we found a "parent set" found = False for i in range(len(match_sets)): m_set = match_sets[i] pages = m_set.split('-') # If either page one or page two are in the current match set if page_1 in pages or page_2 in pages: # We only need to add the one which isn't already in the set (it's possible both are) # NOTE: The sets here do not have to be in order, since we are not looking to remove duplicates. if page_1 not in pages: m_set = '%s-%s' % (m_set, page_1) elif page_2 not in pages: m_set = '%s-%s' % (m_set, page_2) # We found it! found = True # Set the element match_sets[i] = m_set # If we didn't find it, we should add an element if not found: match_sets.append('%s-%s' % (page_1, page_2)) return match_sets full_sets = page_group() print('') print('%d Match sets (%d were transitive)' % (len(full_sets), len(matches_list) - len(full_sets))) [print(' ~ %s' % m) for m in full_sets]