本文主要是介绍2012联邦选举委员会数据库中赞助人和赞助模式的分析统计,希望对大家解决编程问题提供一定的参考价值,需要的开发者们随着小编来一起学习吧!
本例为《利用Python进行数据分析》中第九章最后一节实例,数据来源Github https://github.com/wesm/pydata-book/tree/2nd-edition/datasets/fec
import pandas as pd
import numpy as np
from pandas import Series, DataFrame
from numpy. random import randn, rand
import matplotlib. pyplot as plt
fec= pd. read_csv( 'P00000001-ALL.csv' )
fec. head( )
D:\application_setup\Anaconda\lib\site-packages\IPython\core\interactiveshell.py:3063: DtypeWarning: Columns (6) have mixed types.Specify dtype option on import or set low_memory=False.interactivity=interactivity, compiler=compiler, result=result)
cmte_id cand_id cand_nm contbr_nm contbr_city contbr_st contbr_zip contbr_employer contbr_occupation contb_receipt_amt contb_receipt_dt receipt_desc memo_cd memo_text form_tp file_num 0 C00410118 P20002978 Bachmann, Michelle HARVEY, WILLIAM MOBILE AL 3.6601e+08 RETIRED RETIRED 250.0 20-JUN-11 NaN NaN NaN SA17A 736166 1 C00410118 P20002978 Bachmann, Michelle HARVEY, WILLIAM MOBILE AL 3.6601e+08 RETIRED RETIRED 50.0 23-JUN-11 NaN NaN NaN SA17A 736166 2 C00410118 P20002978 Bachmann, Michelle SMITH, LANIER LANETT AL 3.68633e+08 INFORMATION REQUESTED INFORMATION REQUESTED 250.0 05-JUL-11 NaN NaN NaN SA17A 749073 3 C00410118 P20002978 Bachmann, Michelle BLEVINS, DARONDA PIGGOTT AR 7.24548e+08 NONE RETIRED 250.0 01-AUG-11 NaN NaN NaN SA17A 749073 4 C00410118 P20002978 Bachmann, Michelle WARDENBURG, HAROLD HOT SPRINGS NATION AR 7.19016e+08 NONE RETIRED 300.0 20-JUN-11 NaN NaN NaN SA17A 736166
fec. columns
Index(['cmte_id', 'cand_id', 'cand_nm', 'contbr_nm', 'contbr_city','contbr_st', 'contbr_zip', 'contbr_employer', 'contbr_occupation','contb_receipt_amt', 'contb_receipt_dt', 'receipt_desc', 'memo_cd','memo_text', 'form_tp', 'file_num'],dtype='object')
fec. iloc[ 123456 ]
cmte_id C00431445
cand_id P80003338
cand_nm Obama, Barack
contbr_nm ELLMAN, IRA
contbr_city TEMPE
contbr_st AZ
contbr_zip 852816719
contbr_employer ARIZONA STATE UNIVERSITY
contbr_occupation PROFESSOR
contb_receipt_amt 50
contb_receipt_dt 01-DEC-11
receipt_desc NaN
memo_cd NaN
memo_text NaN
form_tp SA17A
file_num 772372
Name: 123456, dtype: object
unique_cands= fec. cand_nm. unique( )
unique_cands
array(['Bachmann, Michelle', 'Romney, Mitt', 'Obama, Barack',"Roemer, Charles E. 'Buddy' III", 'Pawlenty, Timothy','Johnson, Gary Earl', 'Paul, Ron', 'Santorum, Rick','Cain, Herman', 'Gingrich, Newt', 'McCotter, Thaddeus G','Huntsman, Jon', 'Perry, Rick'], dtype=object)
unique_cands[ 2 ]
'Obama, Barack'
parties= { 'Bachmann, Michelle' : 'Republican' , 'Romney, Mitt' : 'Republican' , 'Gingrich, Newt' : 'Republican' , 'Huntsman, Jon' : 'Republican' , 'Johnson, Gary Earl' : 'Republican' , 'McCotter, Thaddeus G' : 'Republican' , 'Obama, Barack' : 'Democrat' , 'Paul, Ron' : 'Republican' , 'Pawlenty, Timothy' : 'Republican' , 'Perry, Rick' : 'Republican' , "Roemer, Charles E. 'Buddy' III" : 'Republican' , 'Romney, Mitt' : 'Republican' , 'Santorum, Rick' : 'Republican' }
len ( parties)
12
len ( unique_cands)
13
cands= [ name for name in parties]
cands= np. array( cands)
cands
array(['Bachmann, Michelle', 'Romney, Mitt', 'Gingrich, Newt','Huntsman, Jon', 'Johnson, Gary Earl', 'McCotter, Thaddeus G','Obama, Barack', 'Paul, Ron', 'Pawlenty, Timothy', 'Perry, Rick',"Roemer, Charles E. 'Buddy' III", 'Santorum, Rick'], dtype='<U30')
np. in1d( unique_cands, cands)
array([ True, True, True, True, True, True, True, True, False,True, True, True, True])
np. setdiff1d( unique_cands, unique_cands[ np. in1d( unique_cands, cands) ] )
array(['Cain, Herman'], dtype=object)
parties[ 'Cain, Herman' ] = 'Republican'
parties
{'Bachmann, Michelle': 'Republican','Romney, Mitt': 'Republican','Gingrich, Newt': 'Republican','Huntsman, Jon': 'Republican','Johnson, Gary Earl': 'Republican','McCotter, Thaddeus G': 'Republican','Obama, Barack': 'Democrat','Paul, Ron': 'Republican','Pawlenty, Timothy': 'Republican','Perry, Rick': 'Republican',"Roemer, Charles E. 'Buddy' III": 'Republican','Santorum, Rick': 'Republican','Cain, Herman': 'Republican'}
fec[ 'party' ] = fec. cand_nm. map ( parties)
fec. head( )
cmte_id cand_id cand_nm contbr_nm contbr_city contbr_st contbr_zip contbr_employer contbr_occupation contb_receipt_amt contb_receipt_dt receipt_desc memo_cd memo_text form_tp file_num party 0 C00410118 P20002978 Bachmann, Michelle HARVEY, WILLIAM MOBILE AL 3.6601e+08 RETIRED RETIRED 250.0 20-JUN-11 NaN NaN NaN SA17A 736166 Republican 1 C00410118 P20002978 Bachmann, Michelle HARVEY, WILLIAM MOBILE AL 3.6601e+08 RETIRED RETIRED 50.0 23-JUN-11 NaN NaN NaN SA17A 736166 Republican 2 C00410118 P20002978 Bachmann, Michelle SMITH, LANIER LANETT AL 3.68633e+08 INFORMATION REQUESTED INFORMATION REQUESTED 250.0 05-JUL-11 NaN NaN NaN SA17A 749073 Republican 3 C00410118 P20002978 Bachmann, Michelle BLEVINS, DARONDA PIGGOTT AR 7.24548e+08 NONE RETIRED 250.0 01-AUG-11 NaN NaN NaN SA17A 749073 Republican 4 C00410118 P20002978 Bachmann, Michelle WARDENBURG, HAROLD HOT SPRINGS NATION AR 7.19016e+08 NONE RETIRED 300.0 20-JUN-11 NaN NaN NaN SA17A 736166 Republican
( fec. contb_receipt_amt> 0 ) . value_counts( )
True 991475
False 10256
Name: contb_receipt_amt, dtype: int64
fec. cand_nm. isin( [ 'Obama, Barack' , 'Romney, Mitt' ] )
0 False
1 False
2 False
3 False
4 False...
1001726 False
1001727 False
1001728 False
1001729 False
1001730 False
Name: cand_nm, Length: 1001731, dtype: bool
fec_mrbo= fec[ fec. cand_nm. isin( [ 'Obama, Barack' , 'Romney, Mitt' ] ) ]
fec_mrbo. head( )
cmte_id cand_id cand_nm contbr_nm contbr_city contbr_st contbr_zip contbr_employer contbr_occupation contb_receipt_amt contb_receipt_dt receipt_desc memo_cd memo_text form_tp file_num party 411 C00431171 P80003353 Romney, Mitt ELDERBAUM, WILLIAM DPO AA 3.4023e+08 US GOVERNMENT FOREIGN SERVICE OFFICER 25.0 01-FEB-12 NaN NaN NaN SA17A 780124 Republican 412 C00431171 P80003353 Romney, Mitt ELDERBAUM, WILLIAM DPO AA 3.4023e+08 US GOVERNMENT FOREIGN SERVICE OFFICER 110.0 01-FEB-12 NaN NaN NaN SA17A 780124 Republican 413 C00431171 P80003353 Romney, Mitt CARLSEN, RICHARD APO AE 9.128e+07 DEFENSE INTELLIGENCE AGENCY INTELLIGENCE ANALYST 250.0 13-APR-12 NaN NaN NaN SA17A 785689 Republican 414 C00431171 P80003353 Romney, Mitt DELUCA, PIERRE APO AE 9.128e+07 CISCO ENGINEER 30.0 21-AUG-11 NaN NaN NaN SA17A 760261 Republican 415 C00431171 P80003353 Romney, Mitt SARGENT, MICHAEL APO AE 9.01201e+07 RAYTHEON TECHNICAL SERVICES CORP COMPUTER SYSTEMS ENGINEER 100.0 07-MAR-12 NaN NaN NaN SA17A 780128 Republican
occs= fec[ 'contbr_occupation' ] . unique( )
occs
array(['RETIRED', 'INFORMATION REQUESTED', 'RN', ...,'SAFETY SALES MANAGER', 'PRESIDENT & GENERAL MANAGER','LONGWALL MAINTENANCE FOREMAN'], dtype=object)
len ( fec[ 'contbr_occupation' ] . unique( ) )
45074
occ_mapping= { 'INFORMATION REQUESTED' : 'NOT PROVIDED' , 'INFORMATION REQUESTED PER BEST EFFORTS' : 'NOT PROVIDED' , 'INFORMATION REQUESTED(BEST EFFORTS)' : 'NOT PROVIDED' , 'C.E.O.' : 'CEO'
}
fec. groupby( 'contbr_occupation' ) [ 'party' ] . count( )
contbr_occupationMIXED-MEDIA ARTIST / STORYTELLER 1AREA VICE PRESIDENT 1RESEARCH ASSOCIATE 1TEACHER 1THERAPIST 3..
ZOOKEEPER 1
ZOOLOGIST 3
ZOOLOGY EDUCATION 1
\NONE\ 1
~ 1
Name: party, Length: 45073, dtype: int64
f= lambda x: occ_mapping. get( x, x)
fec. contbr_occupation= fec. contbr_occupation. map ( f)
emp_mapping= { 'INFORMATION REQUESTED PER BEST EFFORTS' : 'NOT PROVIDED' , 'INFORMATION REQUESTED' : 'NOT PROVIDED' , 'SELF' : 'SELF-EMPLOYED' , 'SELF EMPLOYED' : 'SELF-EMPLOYED' }
f= lambda x: emp_mapping. get( x, x)
fec. contbr_employer= fec. contbr_employer. map ( f)
by_occupation= fec. pivot_table( values= 'contb_receipt_amt' , index= 'contbr_occupation' , columns= 'party' , aggfunc= 'sum' )
by_occupation
party Democrat Republican contbr_occupation MIXED-MEDIA ARTIST / STORYTELLER 100.0 NaN AREA VICE PRESIDENT 250.0 NaN RESEARCH ASSOCIATE 100.0 NaN TEACHER 500.0 NaN THERAPIST 3900.0 NaN ... ... ... ZOOKEEPER 35.0 NaN ZOOLOGIST 400.0 NaN ZOOLOGY EDUCATION 25.0 NaN \NONE\ NaN 250.0 ~ NaN 75.0
45071 rows × 2 columns
over_2mm= by_occupation[ by_occupation. sum ( 1 ) > 2000000 ]
over_2mm
party Democrat Republican contbr_occupation ATTORNEY 11126932.97 7.343540e+06 CEO 2071474.79 3.932686e+06 CONSULTANT 2459812.71 2.472815e+06 ENGINEER 950425.55 1.780102e+06 EXECUTIVE 1355161.05 3.918557e+06 HOMEMAKER 4243394.30 1.324141e+07 INVESTOR 884133.00 2.320349e+06 LAWYER 3159391.87 3.785903e+05 MANAGER 762693.22 1.404878e+06 NOT PROVIDED 4849801.96 1.975207e+07 OWNER 998867.36 2.279621e+06 PHYSICIAN 3732387.44 3.491657e+06 PRESIDENT 1878009.95 4.469834e+06 PROFESSOR 2163571.08 2.944627e+05 REAL ESTATE 528752.09 1.581747e+06 RETIRED 25270507.23 2.290594e+07 SELF-EMPLOYED 667393.40 1.591758e+06
over_2mm. plot( kind= 'barh' )
<matplotlib.axes._subplots.AxesSubplot at 0x1700e31efc8>
fec_mrbo
cmte_id cand_id cand_nm contbr_nm contbr_city contbr_st contbr_zip contbr_employer contbr_occupation contb_receipt_amt contb_receipt_dt receipt_desc memo_cd memo_text form_tp file_num party 411 C00431171 P80003353 Romney, Mitt ELDERBAUM, WILLIAM DPO AA 3.4023e+08 US GOVERNMENT FOREIGN SERVICE OFFICER 25.0 01-FEB-12 NaN NaN NaN SA17A 780124 Republican 412 C00431171 P80003353 Romney, Mitt ELDERBAUM, WILLIAM DPO AA 3.4023e+08 US GOVERNMENT FOREIGN SERVICE OFFICER 110.0 01-FEB-12 NaN NaN NaN SA17A 780124 Republican 413 C00431171 P80003353 Romney, Mitt CARLSEN, RICHARD APO AE 9.128e+07 DEFENSE INTELLIGENCE AGENCY INTELLIGENCE ANALYST 250.0 13-APR-12 NaN NaN NaN SA17A 785689 Republican 414 C00431171 P80003353 Romney, Mitt DELUCA, PIERRE APO AE 9.128e+07 CISCO ENGINEER 30.0 21-AUG-11 NaN NaN NaN SA17A 760261 Republican 415 C00431171 P80003353 Romney, Mitt SARGENT, MICHAEL APO AE 9.01201e+07 RAYTHEON TECHNICAL SERVICES CORP COMPUTER SYSTEMS ENGINEER 100.0 07-MAR-12 NaN NaN NaN SA17A 780128 Republican ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... 701381 C00431445 P80003338 Obama, Barack MOUNTS, ROBERT APO ZZ 962043077 HQ USFK (FKDC-SA) GS-15 INTERNATIONAL RELATIONS OFFICER 25.0 26-FEB-12 NaN NaN NaN SA17A 787803 Democrat 701382 C00431445 P80003338 Obama, Barack TAITANO, TYRONE HAGATNA ZZ 969323373 NOT EMPLOYED RETIRED 250.0 20-JAN-12 NaN NaN NaN SA17A 775668 Democrat 701383 C00431445 P80003338 Obama, Barack TUCKER, PAMELA APO ZZ 963190030 DODEA EDUCATOR 3.0 20-JAN-12 NaN NaN NaN SA17A 775668 Democrat 701384 C00431445 P80003338 Obama, Barack MOUNTS, ROBERT APO ZZ 962043077 HQ USFK (FKDC-SA) GS-15 INTERNATIONAL RELATIONS OFFICER 25.0 26-APR-12 NaN NaN NaN SA17A 785239 Democrat 701385 C00431445 P80003338 Obama, Barack NEAL, AMBER APO ZZ 091022065 THE DEPARTMENT OF DEFENSE EDUCATION AC TEACHER 135.0 04-SEP-11 NaN X * OBAMA VICTORY FUND 2012 SA18 756218 Democrat
700975 rows × 17 columns
grouped= fec_mrbo. groupby( 'cand_nm' )
def get_top_amounts ( group, key, n= 5 ) : totals= group. groupby( key) [ 'contb_receipt_amt' ] . sum ( ) return ( totals. sort_values( ascending= False ) [ : n] )
grouped. apply ( get_top_amounts, 'contbr_occupation' , n= 7 )
cand_nm contbr_occupation
Obama, Barack RETIRED 25270507.23ATTORNEY 11126932.97NOT PROVIDED 4849801.96HOMEMAKER 4243394.30PHYSICIAN 3732387.44LAWYER 3159391.87CONSULTANT 2459812.71
Romney, Mitt RETIRED 11266949.23NOT PROVIDED 11173374.84HOMEMAKER 8037250.86ATTORNEY 5302578.82PRESIDENT 2403439.77CEO 2247242.03EXECUTIVE 2230653.79
Name: contb_receipt_amt, dtype: float64
grouped. apply ( get_top_amounts, 'contbr_employer' , n= 10 )
cand_nm contbr_employer
Obama, Barack RETIRED 22665902.20SELF-EMPLOYED 18584277.16NOT EMPLOYED 8584118.70NOT PROVIDED 5036178.37HOMEMAKER 2599987.04STUDENT 318831.45VOLUNTEER 257104.00MICROSOFT 215585.36SIDLEY AUSTIN LLP 168254.00REFUSED 149516.07
Romney, Mitt NOT PROVIDED 11827237.12RETIRED 11264701.35HOMEMAKER 8037000.86SELF-EMPLOYED 7265136.53STUDENT 488642.82CREDIT SUISSE 265650.00MORGAN STANLEY 262266.00GOLDMAN SACH & CO. 233250.00BARCLAYS CAPITAL 162750.00H.I.G. CAPITAL 139500.00
Name: contb_receipt_amt, dtype: float64
bins= np. array( [ 0 , 1 , 10 , 100 , 1000 , 10000 , 100000 , 1000000 , 10000000 ] )
labels= pd. cut( fec_mrbo. contb_receipt_amt, bins)
labels
411 (10, 100]
412 (100, 1000]
413 (100, 1000]
414 (10, 100]
415 (10, 100]...
701381 (10, 100]
701382 (100, 1000]
701383 (1, 10]
701384 (10, 100]
701385 (100, 1000]
Name: contb_receipt_amt, Length: 700975, dtype: category
Categories (8, interval[int64]): [(0, 1] < (1, 10] < (10, 100] < (100, 1000] < (1000, 10000] < (10000, 100000] < (100000, 1000000] < (1000000, 10000000]]
grouped= fec_mrbo. groupby( [ 'cand_nm' , labels] )
grouped. size( )
cand_nm contb_receipt_amt
Obama, Barack (0, 1] 493(1, 10] 40070(10, 100] 372280(100, 1000] 153991(1000, 10000] 22284(10000, 100000] 2(100000, 1000000] 3(1000000, 10000000] 4
Romney, Mitt (0, 1] 77(1, 10] 3681(10, 100] 31853(100, 1000] 43357(1000, 10000] 26186(10000, 100000] 1(100000, 1000000] 0(1000000, 10000000] 0
dtype: int64
grouped. size( ) . unstack( 0 )
cand_nm Obama, Barack Romney, Mitt contb_receipt_amt (0, 1] 493 77 (1, 10] 40070 3681 (10, 100] 372280 31853 (100, 1000] 153991 43357 (1000, 10000] 22284 26186 (10000, 100000] 2 1 (100000, 1000000] 3 0 (1000000, 10000000] 4 0
bucket_sums= grouped. contb_receipt_amt. sum ( )
bucket_sums
cand_nm contb_receipt_amt
Obama, Barack (0, 1] 318.24(1, 10] 337267.62(10, 100] 20288981.41(100, 1000] 54798531.46(1000, 10000] 51753705.67(10000, 100000] 59100.00(100000, 1000000] 1490683.08(1000000, 10000000] 7148839.76
Romney, Mitt (0, 1] 77.00(1, 10] 29819.66(10, 100] 1987783.76(100, 1000] 22363381.69(1000, 10000] 63942145.42(10000, 100000] 12700.00(100000, 1000000] NaN(1000000, 10000000] NaN
Name: contb_receipt_amt, dtype: float64
bucket_sums= bucket_sums. unstack( 0 )
bucket_sums
cand_nm Obama, Barack Romney, Mitt contb_receipt_amt (0, 1] 318.24 77.00 (1, 10] 337267.62 29819.66 (10, 100] 20288981.41 1987783.76 (100, 1000] 54798531.46 22363381.69 (1000, 10000] 51753705.67 63942145.42 (10000, 100000] 59100.00 12700.00 (100000, 1000000] 1490683.08 NaN (1000000, 10000000] 7148839.76 NaN
normed_sums= bucket_sums. div( bucket_sums. sum ( axis= 1 ) , axis= 0 )
normed_sums
cand_nm Obama, Barack Romney, Mitt contb_receipt_amt (0, 1] 0.805182 0.194818 (1, 10] 0.918767 0.081233 (10, 100] 0.910769 0.089231 (100, 1000] 0.710176 0.289824 (1000, 10000] 0.447326 0.552674 (10000, 100000] 0.823120 0.176880 (100000, 1000000] 1.000000 NaN (1000000, 10000000] 1.000000 NaN
normed_sums[ : - 2 ] . plot( kind= 'barh' , stacked= True )
<matplotlib.axes._subplots.AxesSubplot at 0x1700ecfd988>
grouped= fec_mrbo. groupby( [ 'cand_nm' , 'contbr_st' ] )
totals= grouped. contb_receipt_amt. sum ( ) . unstack( 0 ) . fillna( 0 )
totals= totals[ totals. sum ( 1 ) > 100000 ]
totals[ : 10 ]
cand_nm Obama, Barack Romney, Mitt contbr_st AK 275353.15 86204.24 AL 537835.48 504882.08 AR 324802.28 105351.50 AZ 1484241.84 1850484.23 CA 23370680.84 10908232.46 CO 2104551.38 1477810.82 CT 2037216.66 3377421.85 DC 4317865.85 999740.50 DE 325394.14 81404.00 FL 7138932.52 8008067.40
percent= totals. div( totals. sum ( 1 ) , axis= 0 )
percent
cand_nm Obama, Barack Romney, Mitt contbr_st AK 0.761575 0.238425 AL 0.515802 0.484198 AR 0.755084 0.244916 AZ 0.445087 0.554913 CA 0.681780 0.318220 CO 0.587476 0.412524 CT 0.376242 0.623758 DC 0.811994 0.188006 DE 0.799891 0.200109 FL 0.471310 0.528690 GA 0.587490 0.412510 HI 0.876334 0.123666 IA 0.741376 0.258624 ID 0.200424 0.799576 IL 0.827649 0.172351 IN 0.622549 0.377451 KS 0.575167 0.424833 KY 0.516502 0.483498 LA 0.357843 0.642157 MA 0.591256 0.408744 MD 0.750631 0.249369 ME 0.910530 0.089470 MI 0.522705 0.477295 MN 0.855263 0.144737 MO 0.495127 0.504873 MS 0.374593 0.625407 MT 0.653063 0.346937 NC 0.655879 0.344121 NE 0.587431 0.412569 NH 0.586226 0.413774 NJ 0.490687 0.509313 NM 0.844133 0.155867 NV 0.540362 0.459638 NY 0.595393 0.404607 OH 0.489412 0.510588 OK 0.413549 0.586451 OR 0.696695 0.303305 PA 0.661856 0.338144 PR 0.889772 0.110228 RI 0.707851 0.292149 SC 0.610531 0.389469 SD 0.345948 0.654052 TN 0.424830 0.575170 TX 0.518505 0.481495 UT 0.122948 0.877052 VA 0.555402 0.444598 VT 0.946176 0.053824 WA 0.765634 0.234366 WI 0.805343 0.194657 WV 0.573891 0.426109 WY 0.440823 0.559177 XX 0.000000 1.000000
这篇关于2012联邦选举委员会数据库中赞助人和赞助模式的分析统计的文章就介绍到这儿,希望我们推荐的文章对编程师们有所帮助!