Slow filling graph

elastica＠laposte.net

24 Nov 2018 24 Nov '18

6:29 p.m.

Hi I was experimenting Kruskal performance against a data file for 100 graphs (n, m<20000) : # ================================================= from time import clock from sys import stderr, stdin import graph_tool as gt from graph_tool.topology import min_spanning_tree import networkx as nx # ------------ NetworkX Code --------------------------- def kruskal_networkX(edges, n): G=nx.Graph() for a, b, w in edges: G.add_edge(a,b,weight=w) return sum(d['weight'] for (_,_,d) in nx.minimum_spanning_tree(G).edges(data=True)) # ------------ Graph_tool Code --------------------------- def kruskal_gt(wedges,n): g= gt.Graph(directed=False) edges, weights= zip(*[((a,b),w) for (a,b,w) in wedges]) weight = g.new_edge_property("long long") for a,b,w in wedges: e=g.add_edge(a,b) weight[e]=w tree=min_spanning_tree(g, weights=weight) return sum(b*weight[e] for (e,b) in zip(g.edges(), tree)) # ------------ Benchmark --------------------------- def go(solver, L): global duration N=len(L) k=1 while k<N: edges=[] n=L[k] k+=1 for a in range(n): d=L[k] k+=1 for j in range(d): b=L[k] k+=1 w=L[k] k+=1 if a<b: edges.append([a,b-1,w]) start=clock() print(solver(edges, n)) print("----------------------------") duration+=clock()-start # data L=list(int(z) for z in stdin.read().split() if z.isdigit()) for solver in [kruskal_networkX, kruskal_gt]: duration=0 go(solver, L) print("%s : %.3f" %(solver.__name__, duration), file=stderr) # ================================================= Outputs are in agreement but gt execution is very slow: $ python3 gt.py < big.in > bench.out kruskal_networkX : 15.721 kruskal_gt : 134.839 the bottleneck being within the following loop-filling the graph: # -------------------------------------------- for a,b,w in wedges: e=g.add_edge(a,b) weight[e]=w # -------------------------------------------- Probably, i'm filling the graph in the wrong way. You can dowload the data file here: https://drive.google.com/file/d/1y0fX1kopgzQEWgmRHRnouWHnMkuliwfw/view?usp=s...

Show replies by date

Alexandre Hannud Abdo

27 Nov 27 Nov

1:52 a.m.

Ni! Hi elastica, In graph-tool you can efficiently add edges with properties using the `add_edge_list` function: https://graph-tool.skewed.de/static/doc/graph_tool.html#graph_tool.Graph.add... I'd also suggest that you benchmark python code with the `cProfile` module, it is much easier and more informative: https://docs.python.org/3/library/profile.html Cheers, ale .~´ On Sat, Nov 24, 2018 at 7:36 PM <elastica@laposte.net> wrote:

...

Hi

I was experimenting Kruskal performance against a data file for 100 graphs (n, m<20000) :

# ================================================= from time import clock from sys import stderr, stdin

import graph_tool as gt from graph_tool.topology import min_spanning_tree

import networkx as nx

# ------------ NetworkX Code ---------------------------

def kruskal_networkX(edges, n): G=nx.Graph() for a, b, w in edges: G.add_edge(a,b,weight=w) return sum(d['weight'] for (_,_,d) in nx.minimum_spanning_tree(G).edges(data=True))

# ------------ Graph_tool Code ---------------------------

def kruskal_gt(wedges,n): g= gt.Graph(directed=False) edges, weights= zip(*[((a,b),w) for (a,b,w) in wedges]) weight = g.new_edge_property("long long") for a,b,w in wedges: e=g.add_edge(a,b) weight[e]=w tree=min_spanning_tree(g, weights=weight) return sum(b*weight[e] for (e,b) in zip(g.edges(), tree))

# ------------ Benchmark ---------------------------

def go(solver, L): global duration

N=len(L) k=1

while k<N: edges=[] n=L[k] k+=1

for a in range(n): d=L[k] k+=1 for j in range(d): b=L[k] k+=1 w=L[k] k+=1 if a<b: edges.append([a,b-1,w]) start=clock() print(solver(edges, n)) print("----------------------------") duration+=clock()-start

# data L=list(int(z) for z in stdin.read().split() if z.isdigit())

for solver in [kruskal_networkX, kruskal_gt]: duration=0 go(solver, L) print("%s : %.3f" %(solver.__name__, duration), file=stderr)

# =================================================

Outputs are in agreement but gt execution is very slow:

$ python3 gt.py < big.in > bench.out kruskal_networkX : 15.721 kruskal_gt : 134.839

the bottleneck being within the following loop-filling the graph:

# -------------------------------------------- for a,b,w in wedges: e=g.add_edge(a,b) weight[e]=w # --------------------------------------------

Probably, i'm filling the graph in the wrong way.

You can dowload the data file here:

https://drive.google.com/file/d/1y0fX1kopgzQEWgmRHRnouWHnMkuliwfw/view?usp=s... _______________________________________________ graph-tool mailing list graph-tool@skewed.de https://lists.skewed.de/mailman/listinfo/graph-tool

elastica

29 Nov 29 Nov

9:20 a.m.

Hi Alexandre and thanks for your useful response The *add_edge_list* internally changes edges ordering to the lexicographic one so in order to set weights, I have to sort the weighted-edge list, adding some little overhead. On the other hand, *add_edge_list()* is much more efficient than adding edges one by one with the add_edge method (about 7 times faster!). Unfortunately, this is still slow, even slower than NetworkX (about 1.2 times slower and we all know that NetworkX highest quality is not speed). Here is the new version: # *********************** from time import clock from sys import stderr, stdin import graph_tool as gt from graph_tool.topology import min_spanning_tree import networkx as nx # ------------ NetworkX Code --------------------------- def kruskal_networkX(edges, n): G=nx.Graph() for a, b, w in edges: G.add_edge(a,b,weight=w) return sum(d['weight'] for (_,_,d) in nx.minimum_spanning_tree(G).edges(data=True)) # ------------ Graph_tool Code --------------------------- def kruskal_gt(wedges,n): wedges.sort() g= gt.Graph(directed=False) edges, weights= zip(*[((a,b),w) for (a,b,w) in wedges]) weight = g.new_edge_property("long long") g.add_edge_list(edges) for e,(_,_,w) in zip(g.edges(), wedges): weight[e]=w tree=min_spanning_tree(g, weights=weight) return sum(b*weight[e] for (e,b) in zip(g.edges(), tree)) # ------------ Benchmark --------------------------- def go(solver, L): global duration N=len(L) k=1 while k<N: edges=[] n=L[k] k+=1 for a in range(n): d=L[k] k+=1 for j in range(d): b=L[k] k+=1 w=L[k] k+=1 if a<b: edges.append([a,b-1,w]) start=clock() print(solver(edges, n)) duration+=clock()-start # data L=list(int(z) for z in stdin.read().split() if z.isdigit()) for solver in [kruskal_networkX, kruskal_gt]: duration=0 go(solver, L) print("----------------------------") print("%s : %.3f" %(solver.__name__, duration), file=stderr) # *********************** $ python3 gt_v2.py < big.in > gt.out kruskal_networkX : 15.851 kruskal_gt : 19.713 -- Sent from: http://main-discussion-list-for-the-graph-tool-project.982480.n3.nabble.com/

Tiago de Paula Peixoto

9:56 a.m.

Am 29.11.18 um 10:20 schrieb elastica:

...

The *add_edge_list* internally changes edges ordering to the lexicographic one so in order to set weights, I have to sort the weighted-edge list, adding some little overhead.

That's not true; no re-ordering is performed by add_edge_list().

...

On the other hand, *add_edge_list()* is much more efficient than adding edges one by one with the add_edge method (about 7 times faster!). Unfortunately, this is still slow, even slower than NetworkX (about 1.2 times slower and we all know that NetworkX highest quality is not speed).

That's because you are looping over the edges _twice_. Take a more careful look at the documentation for add_edge_list(), and you find that it accepts an "eprops" parameters, which lists the edge property maps that need to be filled as well. Just pass your weights to this parameter, and feed it a list of (source, target, weight). For it to be even faster, you can make your edge list be a Numpy array --- in which case the entire loop will be in C++. -- Tiago de Paula Peixoto <tiago@skewed.de>

elastica

1:59 p.m.

Hi, Is the following code correspond to the code you have in mind: ? -- Sent from: http://main-discussion-list-for-the-graph-tool-project.982480.n3.nabble.com/

Tiago de Paula Peixoto

2:12 p.m.

Am 29.11.18 um 14:59 schrieb elastica:

...

Hi,

Is the following code correspond to the code you have in mind:

?

It seems you forgot to actually paste the code. -- Tiago de Paula Peixoto <tiago@skewed.de>

elastica

3:26 p.m.

;) Oops, that's infortunate, sorry for the confusion, yet I pasted the code inside a raw text tag. Again : -- Sent from: http://main-discussion-list-for-the-graph-tool-project.982480.n3.nabble.com/

elastica

3:51 p.m.

Again, doesn't work but the code was visible in the preview. Here the code outside any tag: def kruskal_gt(wedges,n): g= gt.Graph(directed=False) weight = g.new_edge_property("long long") g.add_edge_list(np.array(wedges), eprops=[weight]) tree=min_spanning_tree(g, weights=weight) return sum(b*weight[e] for (e,b) in zip(g.edges(), tree)) -- Sent from: http://main-discussion-list-for-the-graph-tool-project.982480.n3.nabble.com/

Tiago de Paula Peixoto

3:59 p.m.

Am 29.11.18 um 16:51 schrieb elastica:

...

def kruskal_gt(wedges,n): g= gt.Graph(directed=False) weight = g.new_edge_property("long long") g.add_edge_list(np.array(wedges), eprops=[weight]) tree=min_spanning_tree(g, weights=weight) return sum(b*weight[e] for (e,b) in zip(g.edges(), tree))

Yes, this is what I meant. Note that you can still remove the last loop altogether by accessing the property maps as arrays: (tree.a * weight.a).sum() Best, Tiago -- Tiago de Paula Peixoto <tiago@skewed.de>

elastica

30 Nov 30 Nov

6:19 a.m.

...

Note that you can still remove the last loop altogether by accessing the property maps as arrays: (tree.a * weight.a).sum()

This innocuous remark leads in fact to a huge speedup: you gain a factor 13, a lot of surprise! Now, Graph Tool has the best performance among Networkit, Igraph, Scipy and NetworkX of course. However, to put in perspective, Graph Tool is only 2 times faster than pure Python code where Kruskal is implemented with a basic Union-Find, we are far from genuine C/C++ performance, because usually graph implementations are 20 times faster in C/C++ than pure Python. Below, the complete benchmark. Note that Scipy code has the potential to become the faster, the problem with scipy graphs is the need to convert adjacency lists (or edges list) to Compressed Sparse Row and you have to write it in Python, this is VERY slow. # *********************** from time import clock from sys import stderr, stdin import networkit as nk from networkit.graph import RandomMaximumSpanningForest from scipy.sparse.csgraph import minimum_spanning_tree from scipy.sparse import csr_matrix from igraph import Graph import graph_tool as gt from graph_tool.topology import min_spanning_tree import numpy as np # ------------ Pure Python Code --------------------------- def find(s, parents): ups=[s] while True: up=parents[s] if up ==None: break s=up ups.append(s) for v in ups[:-1]: parents[v]=s return s def merge(s, t, parents, heights): rep_s=find(s, parents) rep_t=find(t, parents) if rep_s==rep_t: return False height_s=heights[rep_s] height_t=heights[rep_t] if height_s < height_t: parents[rep_s]=rep_t else: parents[rep_t]=rep_s if height_s==height_t: heights[rep_s]+=1 return True def kruskal_python(edges, n): edges.sort(key=lambda t:t[2]) parents=[None]*n heights=[0]*n k=0 cost=0 cnt=0 while cnt<n-1: a, b, w=edges[k] k+=1 if merge(a,b, parents, heights): cost+=w cnt+=1 return cost # ------------ Graph_tool Code --------------------------- def kruskal_gt_numpy_sum(wedges,n): g= gt.Graph(directed=False) weight = g.new_edge_property("long long") g.add_edge_list(np.array(wedges), eprops=[weight]) tree=min_spanning_tree(g, weights=weight) return (tree.a * weight.a).sum() def kruskal_gt(wedges,n): g= gt.Graph(directed=False) weight = g.new_edge_property("long long") g.add_edge_list(np.array(wedges), eprops=[weight]) tree=min_spanning_tree(g, weights=weight) return sum(b*weight[e] for (e,b) in zip(g.edges(), tree)) # ------------ Igraph Code --------------------------- def kruskal_igraph(wedges, n): edges, weights= zip(*[((a,b),w) for (a,b,w) in wedges]) g = Graph(n, edges=list(edges)) g.es["weight"] = weights tree=g.spanning_tree(weights) return sum(tree.es["weight"]) # ------------ Scipy Code --------------------------- def wedges2adj(edges, n): G=[[]for _ in range(n)] for a, b, w in edges: G[a].append((b,w)) G[b].append((a,w)) return G def wedges2scr(edges, n): G=wedges2adj(edges, n) indptr=[0] # effectifs cumulés cnt=0 for line in G: cnt+=len(line) indptr.append(cnt) data=[] indices=[] for i in range(n): for (j,w) in G[i]: data.append(w) indices.append(j) return [data, indptr, indices] def csr2wedges(Mcsr, shape): n, p=shape k=0 edges=[] data, cumul, cols = Mcsr.data,Mcsr.indptr, Mcsr.indices for i in range(n): for j in range(cumul[i+1]-cumul[i]): edges.append((i, cols[k], data[k])) k+=1 return edges def kruskal_scipy(edges, n): data, indptr, indices=wedges2scr(edges, n) csr=csr_matrix((data, indices, indptr), shape=(n, n)) tree=minimum_spanning_tree(csr) edges=csr2wedges(tree, (n,n)) return sum(int(round(w)) for (a,b,w) in edges) # ------------ Networkit Code --------------------------- def kruskal_networkit(edges, n): G=nk.Graph(n, weighted=True) for u, v,w in edges: G.addEdge(u,v,-w) msf=RandomMaximumSpanningForest(G) msf.run() weights=[] msf.getMSF().forEdges(lambda u, v, w, eid : weights.append(-w)) return round(sum(weights)) # ------------ NetworkX Code --------------------------- def kruskal_networkX(edges, n): import networkx as nx G=nx.Graph() for a, b, w in edges: G.add_edge(a,b,weight=w) return sum(d['weight'] for (_,_,d) in nx.minimum_spanning_tree(G).edges(data=True)) # ------------ Benchmark --------------------------- def go(solver, L): global duration N=len(L) k=1 while k<N: edges=[] n=L[k] k+=1 for a in range(n): d=L[k] k+=1 for j in range(d): b=L[k] k+=1 w=L[k] k+=1 if a<b: edges.append([a,b-1,w]) if solver==kruskal_scipy: data, indptr, indices=wedges2scr(edges, n) start=clock() csr=csr_matrix((data, indices, indptr), shape=(n, n)) tree=minimum_spanning_tree(csr) edges=csr2wedges(tree, (n,n)) duration+=clock()-start else: start=clock() print(solver(edges, n)) duration+=clock()-start # data L=list(int(z) for z in stdin.read().split() if z.isdigit()) for solver in [kruskal_networkit, kruskal_gt_numpy_sum, kruskal_gt,kruskal_python, kruskal_igraph, kruskal_scipy, kruskal_networkX]: duration=0 go(solver, L) print("%s : %.3f" %(solver.__name__, duration), file=stderr) # *********************** Here is the code to generate data tests: # *********************** from random import randrange def random_tree(n): edges=[] for a in range(1,n): b=randrange(a) edges.append((a,b)) return edges def random_connected_graph(n, m): edges={(a,b) if a<b else (b,a) for (a, b) in random_tree(n)} while len(edges)<m: a=randrange(n) b=randrange(n) t=tuple(sorted([a,b])) if t!=(a,a) and t not in edges: edges.add(t) return [(a,b,randrange(1,10**4)) for (a,b) in edges] def generate_testfile(ntest, maxi, filename="big.in"): # test file for Blinnet problem # www.spoj.com/problems/BLINNET/ testfile=open(filename, "w") out=["%s" %ntest] for i in range(ntest): n=randrange(5,MAXI+1) m=randrange(n-1,min(MAXI, n*(n-1)//2)) G=random_connected_graph(n, m) L=[[] for _ in range(n)] for a,b,p in G: L[a].append((b, p)) L[b].append((a, p)) temp=[] temp.append("%s" %n) for i in range(n): temp.append("fake%s" %(i+1)) temp.append("%s" %len(L[i])) for b, p in L[i]: temp.append("%s %s" %(b+1, p)) out.append('\n'.join(temp)) testfile.write('\n\n'.join(out)) MAXI=30000 ntest=100 filename="big.in" generate_testfile(ntest, MAXI, filename) print("Test %s generated" %filename) # *********************** -- Sent from: http://main-discussion-list-for-the-graph-tool-project.982480.n3.nabble.com/

2574

Age (days ago)

2580

Last active (days ago)

List overview

Download

9 comments

4 participants

participants (4)

Alexandre Hannud Abdo
elastica
elastica＠laposte.net
Tiago de Paula Peixoto

Slow filling graph

tags

participants (4)