Source code for selectfirm
#!/usr/bin/env python
import pandas as pd
import os
import pdb
import networkx as nx
def _household_sets(worker_table, firm_ids):
    """Return, in ``firm_ids`` order, the set of ``hieu_id`` values seen among each firm's workers."""
    employed = worker_table[worker_table['firm_id'].isin(firm_ids)]
    by_firm = {
        firm_id: set(hieu_ids)
        for firm_id, hieu_ids in employed.groupby('firm_id')['hieu_id']
    }
    # Firms with no rows in the worker table get an empty set (no links).
    return [by_firm.get(firm_id, set()) for firm_id in firm_ids]


def _build_adjacency(firm_ids, household_sets):
    """Build the 0/1 firm-by-firm matrix: 1 when two firms share at least one ``hieu_id``."""
    size = len(firm_ids)
    cells = [[0] * size for _ in range(size)]
    for row, row_households in enumerate(household_sets):
        # The relation is symmetric, so only the upper triangle is tested
        # and each hit is mirrored.  The diagonal is included, which keeps
        # the original behaviour of marking a firm as adjacent to itself
        # whenever it has at least one worker.
        for col in range(row, size):
            if not row_households.isdisjoint(household_sets[col]):
                cells[row][col] = cells[col][row] = 1
    labels = pd.Index(firm_ids, name='firm_id')
    return pd.DataFrame(cells, index=labels, columns=labels, dtype=int)


def main(
    firm_path='/home/calsim_data/04MAY2018_tables/input_tables/firm_table.csv',
    worker_path='/home/calsim_data/04MAY2018_tables/input_tables/worker_table.csv',
    output_path='/home/calsim_data/04MAY2018_tables/input_tables/adjacent_matrix.csv',
    node=5000780,
):
    """
    Network analysis of firms linked through shared families (code by jason).

    Two firms are treated as connected when persons from the same family
    unit (``hieu_id``) work for both of them; the aim was to isolate
    "firm clusters".  The end result was that almost every firm is connected
    to every other one, so firm clusters turned out not to be useful.

    Steps: read the firm table, keep rows with ``fake_firm == 0`` and one
    row per ``firm_id``; read the worker table; build the firm adjacency
    matrix; print a summary of the resulting graph, its connected component
    sizes, the component containing ``node`` and the nodes of every
    component; finally write the adjacency matrix to ``output_path``.

    Parameters
    ----------
    firm_path : str
        CSV with at least the columns ``firm_id`` and ``fake_firm``.
    worker_path : str
        CSV with at least the columns ``firm_id`` and ``hieu_id``.
    output_path : str
        Destination CSV for the adjacency matrix.
    node : hashable
        Firm id whose connected component is printed.  If it is not a node
        of the graph a message is printed instead of raising.
    """
    firm = pd.read_csv(firm_path)
    # Select and de-duplicate without ``inplace`` so that no filtered slice
    # of the original frame is mutated.
    firm_ids = firm.loc[firm['fake_firm'] == 0, 'firm_id'].drop_duplicates().tolist()

    worker_table = pd.read_csv(worker_path)
    adjacent_matrix = _build_adjacency(
        firm_ids, _household_sets(worker_table, firm_ids)
    )

    graph = nx.from_pandas_adjacency(adjacent_matrix)
    graph.name = 'Graph representation from firm adjacent matrix'
    # ``nx.info`` no longer exists in NetworkX 3.x; print the summary directly.
    print('{}: {} nodes, {} edges'.format(
        graph.name, graph.number_of_nodes(), graph.number_of_edges()))

    components = list(nx.connected_components(graph))
    print('connected component sizes:',
          sorted((len(members) for members in components), reverse=True))

    if node in graph:
        print(nx.node_connected_component(graph, node))
    else:
        print('node {} is not in the firm graph'.format(node))

    # ``connected_component_subgraphs`` was removed in NetworkX 2.4; taking
    # ``Graph.subgraph`` of each component is the supported equivalent.
    for members in components:
        print('networkx subgraph:', graph.subgraph(members).nodes())

    adjacent_matrix.to_csv(output_path)
# Run the firm-network analysis only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()