[SOLVED] How to deal with complex list and dictionary on python

Issue

I’ve been working on python to make a program which need to handle complex problem from list of dict. The thing is I need to transform this data into dictionary and sort it. The input for this function is come from trees. The code I share here is working, but takes a long time to run. In here I wanna ask is there any idea to make this function run more faster in python? I use python 3.7.3 if you ask. The reason I wanna improve this code is because when I tried to make input data for this function need around 3-4 hours, but to run this function need time around 21-22 hours (this really shock me).

here is the structure of data that I input on below:

[ # list of trees
    [ # list of leaf nodes
        { # dict of spg that contain in leaf
            (tau_1,tau_2):[(ent1,ent2),(ent1,ent3)]
        }
    ]
]

this is the sample data that I work on:

[
    [
        {(7, 8): [(1, 28), (1, 29)]},
        {(7, 8): [(1, 30)]},
        {(7, 8): [(1, 32)]},
        {(7, 8): [(1, 21), (1, 22)]},
        {(7, 8): [(1, 18)]},
        {(7, 8): [(1, 19), (1, 31)]},
        {(7, 8): [(1, 13), (1, 14)]},
        {(7, 8): [(1, 16)]},
        {(7, 8): [(1, 15)]},
        {(7, 8): [(1, 11), (1, 12)]},
        {(7, 8): [(1, 17)]},
        {(7, 8): [(1, 23), (1, 26)]},
        {(7, 8): [(1, 20), (1, 27)]},
        {(7, 8): [(1, 7)]},
        {(7, 8): [(1, 9), (1, 10)]},
        {(7, 8): [(1, 8)]},
        {(7, 8): [(1, 5), (1, 6)]},
        {(7, 8): [(1, 24), (1, 25)]},
        {
            (0, 1): [(1, 2)],
            (1, 2): [(1, 2)],
            (2, 3): [(1, 2)],
            (3, 4): [(1, 2)],
            (4, 5): [(1, 2)],
            (5, 6): [(1, 2)],
            (6, 7): [(1, 2)],
            (7, 8): [(1, 2)],
        },
        {
            (0, 1): [(1, 3), (1, 4)],
            (1, 2): [(1, 3), (1, 4)],
            (2, 3): [(1, 3), (1, 4)],
            (3, 4): [(1, 3), (1, 4)],
            (4, 5): [(1, 3), (1, 4)],
            (5, 6): [(1, 3), (1, 4)],
            (6, 7): [(1, 3), (1, 4)],
            (7, 8): [(1, 3), (1, 4)],
        },
    ],
    [
        {(7, 8): [(2, 28), (2, 29)]},
        {(7, 8): [(2, 30)]},
        {(7, 8): [(2, 32)]},
        {(7, 8): [(2, 21), (2, 22)]},
        {(7, 8): [(2, 18)]},
        {(7, 8): [(2, 19), (2, 31)]},
        {(7, 8): [(2, 13), (2, 14)]},
        {(7, 8): [(2, 16)]},
        {(7, 8): [(2, 24), (2, 25)]},
        {(7, 8): [(2, 23)]},
        {(7, 8): [(2, 26), (2, 27)]},
        {(7, 8): [(2, 20)]},
        {(7, 8): [(2, 15)]},
        {(7, 8): [(2, 11), (2, 12)]},
        {(7, 8): [(2, 17)]},
        {(7, 8): [(2, 9), (2, 10)]},
        {(7, 8): [(2, 1)]},
        {(7, 8): [(2, 3), (2, 4)]},
        {(7, 8): [(2, 7), (2, 8)]},
        {(7, 8): [(2, 5), (2, 6)]},
    ]
]

this is the parameters value in case you need to know:

tau_lenght: 50
ent: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42]
gamma: 1
tau_start: [1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63, 65, 67, 69, 71, 73, 75, 77, 79, 81, 83, 85, 87, 89, 91, 93, 95, 97, 99]
tau_end: [2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94, 96, 98, 100]
time_range: 2
json_output: 'output_json'

Here the code:

import json
def prepare_output(data,tau_lenght,ent,gamma,tau_start,tau_end,time_range,json_output):
    # step 1:
    # change data to dict where pair of tau became key
    # list of group of entity into sorted based on o_r
    spgs = {} # collect all spg per 2 tau based on entity
    for tau in range(tau_lenght-1): # for every tau
        temp_tau = [] # containing result per 2 tau
        for o_r in ent: # for every o_r
            temp = [] # contain all SPG whic contain o_r on tau
            for lst in data: # lst is the result on every decision tree
                for leaf in lst: # leaf from decision tree
                    if (tau,tau+1) in leaf.keys(): # looking for pair tau existence
                        # if leaf contain entity o_r
                        if list_contain(leaf[(tau,tau+1)],o_r):
                            # adding new crew candidate
                            # helper.list_of_ent is working for change [(1,2),(2,3)] into [1,2,3]
                            temp.append(helper.list_of_ent(leaf[(tau,tau+1)]))
            if temp: # if temp not empty
                temp_tau.append(temp) # adding temp into temp_tau
        if temp_tau: # if temp_tau not empty
            # connect all candidate into one spg
            cur_tau = []
            for lst in temp_tau:
                for pairs in lst:
                    cur_tau.append(pairs)
            spgs[(tau,tau+1)] = set(tuple(i) for i in cur_tau) # menambahkan 
            # temp_tau into res
    
    # step 2:
    # change into dict which include group of ent as key,
    # and list of (tau1,tau2) as value
    result_per_group = {}
    for tau in spgs.keys():
        for group in spgs[tau]:
            if group in result_per_group.keys():
                result_per_group[group].append(tau)
            else:
                result_per_group[group] = [tau]
    
    # step 3:
    # remove all candidate > gamma
    for group in result_per_group.keys():
        lst = result_per_group[group]
        for tau_idx in range(len(lst)-1):
            tau1_end = tau_end[tau_idx]
            tau2_start = tau_start[tau_idx+1]
            if (tau2_start - tau1_end) > (time_range * gamma):
                result_per_group.pop(group)
                break
    
    global output
    # helper.dict_str_key is fuction I made to convert key of dict into string
    output['output'] = helper.dict_str_key(result_per_group)
    # to convert into json file
    helper.to_json(json_output,output)

# this some function from helper that I use in here
def list_of_ent(lst):
    res = set()
    for e1,e2 in lst:
        res.add(e1)
        res.add(e2)
    return sorted(list(res))

def dict_str_key(data):
    data_str_key = {}
    for key in data.keys():
        data_str_key[str(key)] = data[key]
    return data_str_key

def to_json(name,data):
    name = 'output/'+name+'.json'
    with open(name, 'w') as file:
        json.dump(data, file)

To make question clear, I put the code focus on three big loop as steps.

step 1 used to make output like this:

{
    (tau1, tau2): {
        (ent1,ent2),
        (ent8,ent19)
    }
}

step 2 used to make the output like this:

{
    (ent1,ent2,ent3):[(tau1,tau2),(tau2,tau3)]
}

step 3 have the same format for output step 2

p.s: tau is group of time (in this example I use 2 frame as 1 tau)

There’s the question I wanna ask, sorry if I say it too long or the problem is kinda too complex. I hope my question is clear. I really look forward for any responds and support, thanks.

Solution

Without having the full code to test outputs this is harder to do, but it seems that there are some redundant processes that you are adding elements to a list of lists only to flatten that list and add that to a dictionary as a set. You can increase some of the speed and memory by removing that and instead just adding it to the dictionary right away.

There are some other tweaks that can be done such as using f-strings instead of string concatenation, using list comprehension, and removing having to do the same math in the loop (time_range * gamma) and instead just reference it by memory.

But these are all minor tweaks compared to your step one process which looks to be the largest time sink (approx N^4 in time complexity). I am unsure if it is larger as I don’t see the functions that you use inside that for loop, but tweaking that to reduce the number of calculations would provide the largest benefit to time savings.

import json
from collections import defaultdict

def prepare_output(data,tau_lenght,ent,gamma,tau_start,tau_end,time_range,json_output):
    # step 1:
    # change data to dict where pair of tau became key
    # list of group of entity into sorted based on o_r
    spgs = defaultdict(set) # collect all spg per 2 tau based on entity
    for tau in range(tau_lenght-1): # for every tau
        for o_r in ent: # for every o_r
            for lst in data: # lst is the result on every decision tree
                for leaf in lst: # leaf from decision tree
                    if (tau,tau+1) in leaf: # looking for pair tau existence
                        # if leaf contain entity o_r
                        if list_contain(leaf[(tau,tau+1)],o_r):
                            # adding new crew candidate
                            # helper.list_of_ent is working for change [(1,2),(2,3)] into [1,2,3]
                            list_unique_elements = helper.list_of_ent(leaf[(tau,tau+1)])
                            if (tau,tau+1) in spgs: # menambahkan 
                                dict[(tau,tau+1)].add(tuple(list_unique_elements))
                            else:
                                dict[(tau,tau+1)] = {tuple(list_unique_elements)}
    
    # step 2:
    # change into dict which include group of ent as key,
    # and list of (tau1,tau2) as value
    result_per_group = {}
    for tau, value in spgs.items():
        for group in value:
            if group in result_per_group:
                result_per_group[group].append(tau)
            else: 
                result_per_group[group] = [tau]
    
    # step 3:
    # remove all candidate > gamma
    time_gamma_multiplied = time_range * gamma
    for group,lst in result_per_group.items():
        for tau_idx in lst[:-1]:
            if (tau_start[tau_idx+1] - tau_end[tau_idx]) > time_gamma_multiplied:
                result_per_group.pop(group)
                break
    
    global output
    # helper.dict_str_key is fuction I made to convert key of dict into string
    output['output'] = helper.dict_str_key(result_per_group)
    # to convert into json file
    helper.to_json(json_output,output)

# this some function from helper that I use in here
def list_of_ent(lst):
    res = set()
    for e1,e2 in lst:
        res.add(e1)
        res.add(e2)
    return sorted(list(res))

def dict_str_key(data):
    return {data_str_key[str(key)]: value for key, value in data.items()}

def to_json(name,data):
    with open(f'output/{name}.json', 'w') as file:
        json.dump(data, file)

Obviously profiling your code would give you the best indicator for what is taking the most time and to find processes that are either redundant or that take a large time and could be reduced with minor tweaks.

Answered By – Andrew Ryan

Answer Checked By – Marie Seifert (BugsFixing Admin)

Leave a Reply

Your email address will not be published. Required fields are marked *