본문 바로가기
AI & Data Analysis/Kaggle Notebook

[EDA & Visualization] San_Francisco Data

by 로토마 2022. 3. 31.

https://www.kaggle.com/code/emilyjiminroh/eda-visualization-san-francisco-notebook

# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
/kaggle/input/san-francisco-crime-classification/train.csv
/kaggle/input/san-francisco-crime-classification/test.csv
In [2]:
# 데이터를 우선 가져와야합니다.
train = pd.read_csv('../input/san-francisco-crime-classification/train.csv')
train
Out[2]:
DatesCategoryDescriptDayOfWeekPdDistrictResolutionAddressXY01234...878044878045878046878047878048
2015-05-13 23:53:00 WARRANTS WARRANT ARREST Wednesday NORTHERN ARREST, BOOKED OAK ST / LAGUNA ST -122.425892 37.774599
2015-05-13 23:53:00 OTHER OFFENSES TRAFFIC VIOLATION ARREST Wednesday NORTHERN ARREST, BOOKED OAK ST / LAGUNA ST -122.425892 37.774599
2015-05-13 23:33:00 OTHER OFFENSES TRAFFIC VIOLATION ARREST Wednesday NORTHERN ARREST, BOOKED VANNESS AV / GREENWICH ST -122.424363 37.800414
2015-05-13 23:30:00 LARCENY/THEFT GRAND THEFT FROM LOCKED AUTO Wednesday NORTHERN NONE 1500 Block of LOMBARD ST -122.426995 37.800873
2015-05-13 23:30:00 LARCENY/THEFT GRAND THEFT FROM LOCKED AUTO Wednesday PARK NONE 100 Block of BRODERICK ST -122.438738 37.771541
... ... ... ... ... ... ... ... ...
2003-01-06 00:15:00 ROBBERY ROBBERY ON THE STREET WITH A GUN Monday TARAVAL NONE FARALLONES ST / CAPITOL AV -122.459033 37.714056
2003-01-06 00:01:00 LARCENY/THEFT GRAND THEFT FROM LOCKED AUTO Monday INGLESIDE NONE 600 Block of EDNA ST -122.447364 37.731948
2003-01-06 00:01:00 LARCENY/THEFT GRAND THEFT FROM LOCKED AUTO Monday SOUTHERN NONE 5TH ST / FOLSOM ST -122.403390 37.780266
2003-01-06 00:01:00 VANDALISM MALICIOUS MISCHIEF, VANDALISM OF VEHICLES Monday SOUTHERN NONE TOWNSEND ST / 2ND ST -122.390531 37.780607
2003-01-06 00:01:00 FORGERY/COUNTERFEITING CHECKS, FORGERY (FELONY) Monday BAYVIEW NONE 1800 Block of NEWCOMB AV -122.394926 37.738212

878049 rows × 9 columns

In [3]:
train.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 878049 entries, 0 to 878048
Data columns (total 9 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   Dates       878049 non-null  object 
 1   Category    878049 non-null  object 
 2   Descript    878049 non-null  object 
 3   DayOfWeek   878049 non-null  object 
 4   PdDistrict  878049 non-null  object 
 5   Resolution  878049 non-null  object 
 6   Address     878049 non-null  object 
 7   X           878049 non-null  float64
 8   Y           878049 non-null  float64
dtypes: float64(2), object(7)
memory usage: 60.3+ MB

NULL 값 확인

In [4]:
# null 값이 존재하는가?
train.isnull().sum() 
Out[4]:
Dates         0
Category      0
Descript      0
DayOfWeek     0
PdDistrict    0
Resolution    0
Address       0
X             0
Y             0
dtype: int64

중복 값 확인 후 처리

In [5]:
# 중복 값 확인
train.duplicated().sum()
Out[5]:
2323
In [6]:
#shape는 데이터의 구조를 알려준다. (row수와 column수)
shape_before = train.shape
shape_before
Out[6]:
(878049, 9)
In [7]:
# DataFrame.drop_duplicates(subset=None, keep='first', inplace=False, ignore_index=False)
train.drop_duplicates(subset=None,keep="first",inplace=True)
In [8]:
shape_after = train.shape
shape_after
Out[8]:
(875726, 9)
In [9]:
# 데이터 중복 처리 결과 확인
print('original_sample: ',shape_before,
     '\nafter_drop_sample: ', shape_after,
     '\nTotal_drop_Check: ',shape_before[0],'-',shape_after[0],
      '=', shape_before[0]-shape_after[0],'Samples')
original_sample:  (878049, 9) 
after_drop_sample:  (875726, 9) 
Total_drop_Check:  878049 - 875726 = 2323 Samples

범죄 종류 당 횟수 출력

In [10]:
# 같은 카테고리를 가진 값들 개수 반환
pd.DataFrame(train['Category'].value_counts(ascending=False))
Out[10]:
CategoryLARCENY/THEFTOTHER OFFENSESNON-CRIMINALASSAULTDRUG/NARCOTICVEHICLE THEFTVANDALISMWARRANTSBURGLARYSUSPICIOUS OCCMISSING PERSONROBBERYFRAUDFORGERY/COUNTERFEITINGSECONDARY CODESWEAPON LAWSPROSTITUTIONTRESPASSSTOLEN PROPERTYSEX OFFENSES FORCIBLEDISORDERLY CONDUCTDRUNKENNESSRECOVERED VEHICLEKIDNAPPINGDRIVING UNDER THE INFLUENCELIQUOR LAWSRUNAWAYARSONLOITERINGEMBEZZLEMENTSUICIDEFAMILY OFFENSESBAD CHECKSBRIBERYEXTORTIONSEX OFFENSES NON FORCIBLEGAMBLINGPORNOGRAPHY/OBSCENE MATTREA
174320
125960
91915
76815
53919
53706
44581
42145
36600
31394
25669
22988
16637
10592
9979
8550
7446
7318
4537
4380
4313
4277
3132
2340
2268
1899
1894
1512
1207
1164
508
488
406
289
256
148
146
22
6

범죄 종류 당 횟수 시각화

In [11]:
# 시각화 with matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

# subplots 하나의 그래프에 여러 값들 겹쳐서 나타내기
# plt.subplot(row,column,index)
fig, count = plt.subplots(figsize=(17,15))
# countplot: 각 카데고리 값 별로 데이터가 얼마나 있는지 표시
count = sns.countplot(y=train['Category'])
count.set_title('Crime counts')

# 내림차순으로 보이게 하려면??
# order 인자 추가 아래 코드!!
Out[11]:
Text(0.5, 1.0, 'Crime counts')

내림차순 정렬

In [12]:
fig, count = plt.subplots(figsize=(17,15))
count = sns.countplot(y=train['Category'], order = train['Category'].value_counts().index)
count.set_title('Crime counts with decending')
Out[12]:
Text(0.5, 1.0, 'Crime counts with decending')

범죄 종류 별 빈도 파악

In [13]:
# frequency랑 누적_frequency의 차이점은? 
# frequency는 고유의 빈도수
# cumulative_freq는 이전 고유 빈도수의 누적 + 현재의 누적 값

# 고유 행수가 포함된 열 개수반환, normalize 비율 반환
category_freq = pd.DataFrame(train['Category'].value_counts(normalize=True))
# index 다시 생성
category_freq.reset_index(inplace=True)
# index -> Category, Category -> frequency
category_freq.rename({'index': 'Category', 'Category': 'Frequency'}, axis='columns', inplace=True)
# 누적 합계를 포함하는 동일한 크기의 DataFrame 반환
category_freq['Cumulative_Frequency'] = category_freq['Frequency'].cumsum()

category_freq
Out[13]:
CategoryFrequencyCumulative_Frequency01234567891011121314151617181920212223242526272829303132333435363738
LARCENY/THEFT 0.199058 0.199058
OTHER OFFENSES 0.143835 0.342893
NON-CRIMINAL 0.104959 0.447851
ASSAULT 0.087716 0.535567
DRUG/NARCOTIC 0.061571 0.597138
VEHICLE THEFT 0.061327 0.658465
VANDALISM 0.050907 0.709373
WARRANTS 0.048126 0.757498
BURGLARY 0.041794 0.799292
SUSPICIOUS OCC 0.035849 0.835141
MISSING PERSON 0.029312 0.864453
ROBBERY 0.026250 0.890703
FRAUD 0.018998 0.909701
FORGERY/COUNTERFEITING 0.012095 0.921796
SECONDARY CODES 0.011395 0.933191
WEAPON LAWS 0.009763 0.942955
PROSTITUTION 0.008503 0.951457
TRESPASS 0.008356 0.959814
STOLEN PROPERTY 0.005181 0.964995
SEX OFFENSES FORCIBLE 0.005002 0.969996
DISORDERLY CONDUCT 0.004925 0.974921
DRUNKENNESS 0.004884 0.979805
RECOVERED VEHICLE 0.003576 0.983382
KIDNAPPING 0.002672 0.986054
DRIVING UNDER THE INFLUENCE 0.002590 0.988644
LIQUOR LAWS 0.002168 0.990812
RUNAWAY 0.002163 0.992975
ARSON 0.001727 0.994702
LOITERING 0.001378 0.996080
EMBEZZLEMENT 0.001329 0.997409
SUICIDE 0.000580 0.997989
FAMILY OFFENSES 0.000557 0.998546
BAD CHECKS 0.000464 0.999010
BRIBERY 0.000330 0.999340
EXTORTION 0.000292 0.999632
SEX OFFENSES NON FORCIBLE 0.000169 0.999801
GAMBLING 0.000167 0.999968
PORNOGRAPHY/OBSCENE MAT 0.000025 0.999993
TREA 0.000007 1.000000

약 80%의 범죄 유형 데이터 정리

In [14]:
# 누적 frequency가 0.8 이하이전의 범죄 유형 리스트 만들기
candidate_classes = list(category_freq.loc[category_freq['Cumulative_Frequency'] < 0.8, 'Category'])
candidate_classes
Out[14]:
['LARCENY/THEFT',
 'OTHER OFFENSES',
 'NON-CRIMINAL',
 'ASSAULT',
 'DRUG/NARCOTIC',
 'VEHICLE THEFT',
 'VANDALISM',
 'WARRANTS',
 'BURGLARY']
In [15]:
# 데이터 프레임 슬라이싱
# 위에서 만든 리스트에 해당되는 데이터 행 불러오기
train_reduced_categories = train.loc[train['Category'].isin(candidate_classes)]
train_reduced_categories 
Out[15]:
DatesCategoryDescriptDayOfWeekPdDistrictResolutionAddressXY01234...878042878043878045878046878047
2015-05-13 23:53:00 WARRANTS WARRANT ARREST Wednesday NORTHERN ARREST, BOOKED OAK ST / LAGUNA ST -122.425892 37.774599
2015-05-13 23:53:00 OTHER OFFENSES TRAFFIC VIOLATION ARREST Wednesday NORTHERN ARREST, BOOKED OAK ST / LAGUNA ST -122.425892 37.774599
2015-05-13 23:33:00 OTHER OFFENSES TRAFFIC VIOLATION ARREST Wednesday NORTHERN ARREST, BOOKED VANNESS AV / GREENWICH ST -122.424363 37.800414
2015-05-13 23:30:00 LARCENY/THEFT GRAND THEFT FROM LOCKED AUTO Wednesday NORTHERN NONE 1500 Block of LOMBARD ST -122.426995 37.800873
2015-05-13 23:30:00 LARCENY/THEFT GRAND THEFT FROM LOCKED AUTO Wednesday PARK NONE 100 Block of BRODERICK ST -122.438738 37.771541
... ... ... ... ... ... ... ... ...
2003-01-06 00:20:00 ASSAULT ATTEMPTED HOMICIDE WITH A GUN Monday BAYVIEW ARREST, BOOKED 1500 Block of SHAFTER AV -122.389769 37.730564
2003-01-06 00:20:00 OTHER OFFENSES PAROLE VIOLATION Monday BAYVIEW ARREST, BOOKED 1500 Block of SHAFTER AV -122.389769 37.730564
2003-01-06 00:01:00 LARCENY/THEFT GRAND THEFT FROM LOCKED AUTO Monday INGLESIDE NONE 600 Block of EDNA ST -122.447364 37.731948
2003-01-06 00:01:00 LARCENY/THEFT GRAND THEFT FROM LOCKED AUTO Monday SOUTHERN NONE 5TH ST / FOLSOM ST -122.403390 37.780266
2003-01-06 00:01:00 VANDALISM MALICIOUS MISCHIEF, VANDALISM OF VEHICLES Monday SOUTHERN NONE TOWNSEND ST / 2ND ST -122.390531 37.780607

699961 rows × 9 columns

누적 80%를 차지하는 범죄유형 시각화

In [16]:
# 내림차순 정렬은 countplot에서 oreder라는 인자 추가해서 조정하면 된다!
fig, count = plt.subplots(figsize = (15,5))
count = sns.countplot(y = train_reduced_categories['Category'],order=train_reduced_categories['Category'].value_counts().index)
count.set_title('Most common types of crimes with descending')
Out[16]:
Text(0.5, 1.0, 'Most common types of crimes with descending')
In [17]:
warrants = train_reduced_categories.loc[train_reduced_categories['Category'] == 'WARRANTS']
warrants
Out[17]:
DatesCategoryDescriptDayOfWeekPdDistrictResolutionAddressXY022666775...877968877995878026878027878038
2015-05-13 23:53:00 WARRANTS WARRANT ARREST Wednesday NORTHERN ARREST, BOOKED OAK ST / LAGUNA ST -122.425892 37.774599
2015-05-13 21:11:00 WARRANTS WARRANT ARREST Wednesday TENDERLOIN NONE 100 Block of JONES ST -122.412250 37.782556
2015-05-13 19:11:00 WARRANTS ENROUTE TO PAROLE OFFICER Wednesday TENDERLOIN NONE 400 Block of HYDE ST -122.416236 37.784913
2015-05-13 19:11:00 WARRANTS ENROUTE TO OUTSIDE JURISDICTION Wednesday TENDERLOIN NONE 400 Block of HYDE ST -122.416236 37.784913
2015-05-13 18:58:00 WARRANTS ENROUTE TO OUTSIDE JURISDICTION Wednesday MISSION NONE 1900 Block of MISSION ST -122.419677 37.765851
... ... ... ... ... ... ... ... ...
2003-01-06 08:45:00 WARRANTS ENROUTE TO OUTSIDE JURISDICTION Monday SOUTHERN ARREST, BOOKED 0 Block of UNITED NATIONS PZ -122.414318 37.779944
2003-01-06 07:08:00 WARRANTS ENROUTE TO OUTSIDE JURISDICTION Monday MISSION ARREST, BOOKED 3000 Block of 16TH ST -122.421663 37.764876
2003-01-06 02:06:00 WARRANTS WARRANT ARREST Monday BAYVIEW ARREST, BOOKED NEWHALL ST / GALVEZ AV -122.387710 37.740674
2003-01-06 02:00:00 WARRANTS ENROUTE TO OUTSIDE JURISDICTION Monday SOUTHERN ARREST, BOOKED 900 Block of MARKET ST -122.409708 37.782828
2003-01-06 00:42:00 WARRANTS ENROUTE TO OUTSIDE JURISDICTION Monday TENDERLOIN ARREST, BOOKED TAYLOR ST / GEARY ST -122.411519 37.786941

42145 rows × 9 columns

In [18]:
warrants['Descript'].value_counts()
Out[18]:
WARRANT ARREST                          23702
ENROUTE TO OUTSIDE JURISDICTION         11456
ENROUTE TO DEPARTMENT OF CORRECTIONS     3245
ENROUTE TO PAROLE OFFICER                2149
ENROUTE TO ADULT AUTHORITY                884
PROBATION SEARCH                          207
ENROUTE TO PROBATION OFFICER              207
ENROUTE TO U.S. MARSHALL                  178
POST RELEASE COMMUNITY SUPERVISION         83
PAROLE SEARCH                              34
Name: Descript, dtype: int64
In [19]:
fig, count = plt.subplots(figsize = (15,5))
count = sns.countplot(y = warrants['Descript'])
count.set_title('Warrant Descriptions Counts')
Out[19]:
Text(0.5, 1.0, 'Warrant Descriptions Counts')
In [20]:
# 내림차순 정렬
fig, count = plt.subplots(figsize = (15,5))
count = sns.countplot(y = warrants['Descript'],order=warrants['Descript'].value_counts().index)
count.set_title('Warrant Descriptions Counts with descending')
Out[20]:
Text(0.5, 1.0, 'Warrant Descriptions Counts with descending')
In [21]:
others = train_reduced_categories.loc[train_reduced_categories['Category'] == 'OTHER OFFENSES']
others
Out[21]:
DatesCategoryDescriptDayOfWeekPdDistrictResolutionAddressXY12111829...878029878032878039878041878043
2015-05-13 23:53:00 OTHER OFFENSES TRAFFIC VIOLATION ARREST Wednesday NORTHERN ARREST, BOOKED OAK ST / LAGUNA ST -122.425892 37.774599
2015-05-13 23:33:00 OTHER OFFENSES TRAFFIC VIOLATION ARREST Wednesday NORTHERN ARREST, BOOKED VANNESS AV / GREENWICH ST -122.424363 37.800414
2015-05-13 22:30:00 OTHER OFFENSES MISCELLANEOUS INVESTIGATION Wednesday TARAVAL NONE 0 Block of ESCOLTA WY -122.487983 37.737667
2015-05-13 21:40:00 OTHER OFFENSES TRAFFIC VIOLATION Wednesday BAYVIEW ARREST, BOOKED MENDELL ST / HUDSON AV -122.386401 37.738983
2015-05-13 20:56:00 OTHER OFFENSES MISCELLANEOUS INVESTIGATION Wednesday TARAVAL NONE 2000 Block of 41ST AV -122.499787 37.748518
... ... ... ... ... ... ... ... ...
2003-01-06 01:54:00 OTHER OFFENSES PROBATION VIOLATION Monday TENDERLOIN ARREST, BOOKED 1400 Block of GOLDEN GATE AV -122.434423 37.779193
2003-01-06 01:36:00 OTHER OFFENSES DRIVERS LICENSE, SUSPENDED OR REVOKED Monday NORTHERN ARREST, CITED GEARY BL / FRANKLIN ST -122.423031 37.785482
2003-01-06 00:40:00 OTHER OFFENSES DRIVERS LICENSE, SUSPENDED OR REVOKED Monday NORTHERN ARREST, CITED POLK ST / CALIFORNIA ST -122.420692 37.790577
2003-01-06 00:31:00 OTHER OFFENSES DRIVERS LICENSE, SUSPENDED OR REVOKED Monday RICHMOND ARREST, CITED CLEMENT ST / 14TH AV -122.472985 37.782552
2003-01-06 00:20:00 OTHER OFFENSES PAROLE VIOLATION Monday BAYVIEW ARREST, BOOKED 1500 Block of SHAFTER AV -122.389769 37.730564

125960 rows × 9 columns

In [22]:
others['Descript'].unique()
Out[22]:
array(['TRAFFIC VIOLATION ARREST', 'MISCELLANEOUS INVESTIGATION',
       'TRAFFIC VIOLATION', 'PROBATION VIOLATION',
       'STAY AWAY ORDER VIOLATION, DV RELATED',
       'DRIVERS LICENSE, SUSPENDED OR REVOKED', 'RESISTING ARREST',
       'LOST/STOLEN LICENSE PLATE',
       'FRAUDULENT GAME OR TRICK, OBTAINING MONEY OR PROPERTY',
       'RECKLESS DRIVING', 'VIOLATION OF RESTRAINING ORDER',
       'VIOLATION OF MUNICIPAL CODE', 'PAROLE VIOLATION',
       'HARASSING PHONE CALLS', 'INCIDENT ON SCHOOL GROUNDS',
       'VIOLATION OF PARK CODE', 'POSSESSION OF BURGLARY TOOLS W/PRIORS',
       'CONSPIRACY',
       'RESTRAINING ORDER NOTIFICATION/SERVICE OF RESTRAINING ORDER',
       'TAMPERING WITH A VEHICLE', 'VIOLATION OF MUNICIPAL POLICE CODE',
       'FALSE PERSONATION', 'POSSESSION OF BURGLARY TOOLS',
       'FALSE PERSONATION TO RECEIVE MONEY OR PROPERTY',
       'TRAFFIC COLLISION, HIT & RUN, INJURY', 'OBSCENE PHONE CALLS(S)',
       'EVADING A POLICE OFFICER RECKLESSLY', 'DEFRAUDING TAXI DRIVER',
       'PERMIT VIOLATION, POLICE (GENERAL)',
       'FALSE EVIDENCE OF VEHICLE REGISTRATION', 'FALSE FIRE ALARM',
       'DOG, STRAY OR VICIOUS', 'INJURY TO TELEGRAPH/TELEPHONE LINES',
       'TRAFFIC COLLISION, HIT & RUN, PROPERTY DAMAGE',
       'FAILURE TO REGISTER AS SEX OFFENDER',
       'PROBATION VIOLATION, DV RELATED',
       'VEHICLE ALARM CODE GRABBING DEVICE, POSSESS OR USE',
       'OBSTRUCTIONS ON STREETS/SIDEWALKS',
       'VIOLATION OF EMERGENCY PROTECTIVE ORDER', 'INDECENT EXPOSURE',
       'VIOLATION OF STAY AWAY ORDER', 'MISCELLANEOUS STATE FELONY',
       'MISCELLANEOUS STATE MISDEMEANOR', 'CRUELTY TO ANIMALS',
       'SPEEDING', 'SCHOOL, PUBLIC, TRESPASS',
       'DRIVING, RECKLESS, WITH INJURY',
       'OPEN CONTAINER OF ALCOHOL IN VEHICLE',
       'SCHOOL PROPERTY, DISTURBANCE ON',
       'FALSE PERSONATION AND CHEAT CRIMES (GENERAL)', 'CURFEW VIOLATION',
       'DESTITUTE MINOR', 'BEYOND PARENTAL CONTROL',
       'FIREARMS, SEIZING AT SCENE OF DV', 'ANIMAL, FIGHTING',
       'DANGER OF LEADING IMMORAL LIFE',
       'DRIVES VEHICLE ALONG TRACK OF RAILROAD', 'PHONE CALLS, OBSCENE',
       'VIOLATION OF FIRE CODE',
       'PHONE CALLS IN VIOLATION OF DV COURT ORDER',
       'VIN, ALTER OR REMOVE', 'EVADING A POLICE VEHICLE OR BICYCLE',
       'CONTRIBUTING TO THE DELINQUENCY OF MINOR',
       'FALSE REPORT OF CRIME',
       'MONEY, PROPERTY OR LABOR, FRAUDULENTLY OBTAINING',
       'THROWING INJURIOUS SUBSTANCE ON HIGHWAY',
       'TOBACCO PRODUCTS, SELLING OR FURNISHING TO MINOR',
       'VEHICLE, DISABLED PLACARD VIOLATION',
       'PUBLIC TRANSIT CRIMES - INFRACTIONS',
       'EVADING PAYMENT OF RAILROAD FARE', 'INTOXICATED JUVENILE',
       'POSSESSION OF ARTICLES WITH IDENTIFICATION REMOVE',
       'INDECENT EXPOSURE (JUVENILE VICTIM)',
       'INTERFERRING WITH A FIREMAN', 'THROWING SUBSTANCE AT VEHICLE',
       'MINOR PURCHASING OR RECEIVING TOBACCO PRODUCT',
       'VIOLATION OF FEDERAL STATUTE',
       'OTHER OFFENSES AGAINST PUBLIC JUSTICE',
       'PEDDLING WITHOUT A LICENSE',
       'FAILURE TO HEED RED LIGHT AND SIREN',
       'PERMIT VIOLATION, ENTERTAINMENT',
       'INDECENT EXPOSURE WITH PRIOR CONVICTION',
       'INTERFERRING WITH A POLICE OFFICER',
       'COMPUTER SYSTEM, ACCESSING, COPYING, OR DAMAGING',
       'PLACING TRASH ON THE STREET', 'SPITTING ON SIDEWALK',
       'ACTS AGAINST PUBLIC TRANSIT',
       'HAZARDOUS MATERIALS, SPILL ON ROADWAY',
       'OPERATING TAXI WITHOUT A PERMIT', 'FALSE REPORT OF BOMB',
       'THROWING OBJECT AT COMMON CARRIER, PASSENGER OR FREIGHT',
       'DUMPING OF OFFENSIVE MATTER', 'PERMIT VIOLATION, VALET PARKING',
       'JUDGE/JUROR ACCEPTING A BRIBE',
       'WEARING MASK OR DISGUISE FOR UNLAWFUL PURPOSE',
       'POSSESSION OF FIRECRACKERS', 'AGGRESSIVE SOLICITING',
       'HABITUAL TRUANT', 'ESCAPE OR ASSISTING ESCAPE',
       'TAKING CONTRABAND INTO A REFORMATORY',
       'DISRUPTS SCHOOL ACTIVITIES', 'PHONE CALLS, HARASSING, TO 911',
       'RIOT', 'DOG, BARKING', 'SOLICITING COMMISSION OF A CRIME',
       'DOG OR CAT, ABANDONMENT OF',
       'EVADING A POLICE OFFICER, INJURY OR DEATH',
       'FIREWORKS, THROW AT PERSON OR DISCHARGE IN CROWD',
       'LYNCHING BY RIOT', 'PEEPING TOM', 'ABORTION',
       'AID OR HARBOR FELON', 'ANIMAL, WITHOUT PROPER CARE OR ATTENTION',
       'LICENSE PLATE, STOLEN', 'SCALPING TICKETS',
       'FOOD STAMPS, MISUSE OF',
       'INSURED PROPERTY, DESTRUCTION TO DEFRAUD INSURER',
       'MASSAGE ESTABLISHMENT PERMIT VIOLATION',
       'DISCHARGING OFFENSIVE OR INJURIOUS SUBSTANCE IN PUBLIC AREA',
       'VIOLATION OF CIVIL GANG INJUNCTION',
       'PERMIT VIOLATION, SIDEWALK SALES',
       'SELLING/DISCHARGING OF FIRECRACKERS',
       'ADVERTISING DISTRIBUTORS PERMIT VIOLATION',
       'HAZARDOUS MATERIALS, DUMPING IN UNAUTHORIZED LOCATN',
       'RESCUING PRISONER FROM LAWFUL CUSTODY',
       'VIOLATION OF CALIF UNEMPLOYMENT INSURANCE ACT',
       'HAZARDOUS SUBSTANCES, DEPOSITING',
       'SCHOOL GROUNDS, ENTRY BY SEX OFFENDER', 'ESCAPEE, JUVENILE',
       'TAMPERING WITH MAIL', 'SELLING RESTRICTED GLUE TO JUVENILES',
       'FORTUNE TELLING',
       'SCHOOL STUDENT OR EMPLOYEE ENTERING  CAMPUS AFTER SUSPENSION OR DISMISSAL',
       'HEATING VIOLATION  APT/HOTEL', 'VIN SWITCH',
       'HAZARDOUS MATERIALS, DUMP OIL INTO SEWERS',
       'BATHROOM HOLE, LOOKING THROUGH',
       'OPERATING WITHOUT DANCEHALL PERMIT',
       'GUIDE DOG, INTERFERING WITH', 'ESCAPES', 'DRIVING, DRAG RACING',
       'PERJURY', 'LOUDSPEAKER OR SOUND TRUCK PERMIT VIOLATION',
       'DRUG OFFENDER, PRESENCE  NEAR SCHOOL GROUNDS',
       'ESCAPE FROM HOSPITAL WITH FORCE', 'JUVENILE PAROLE VIOLATOR',
       'HAZARDOUS MATERIALS, DUMPING IN UNAUTHORIZED LOCATION',
       'POLICE BROADCAST, INTERCEPTION TO COMMIT CRIME',
       'DISPLAY & SALE OF SPRAY PAINT & MARKER PENS',
       'HAZARDOUS MATERIALS,  DUMP ANY SUBSTANCE INTO WATER',
       'DOG, FIGHTING;  OWNING, FIGHTING, OR ATTENDING FIGHT',
       'SOLICITING MINOR TO COMMIT FELONY', 'POISONING ANIMALS',
       'WEARING THE APPAREL OF OPPOSITE SEX TO DECEIVE',
       'HAZARDOUS MATERIALS, SPILL LOAD',
       'HAZARDOUS MATERIALS, FAILURE TO COMPLY W/REGULATIONS',
       'VIOLATION OF STATE LABOR CODE', 'ESCAPE FROM JAIL',
       'PUSH-CART PEDDLER PERMIT VIOLATION',
       'SALE OF SATELLITE TELEPHONE NUMBER',
       'ESCAPE OF PRISONER WHILE HOSPITALIZED',
       'INJURY TO RAILROADS/RAILROAD BRIDGES',
       'OPERATING WITHOUT CABARET PERMIT',
       'AFFIXING ADVERTISMENTS TO POLES',
       'ILLEGAL CHARITABLE SOLICITATIONS', 'OVERCHARGING TAXI FARE',
       'UNKNOWN COMPLAINT', 'UNAUTHORIZED USE OF LOUD SPEAKERS',
       'REFUSAL TO IDENTIFY'], dtype=object)
In [23]:
larceny_theft = train_reduced_categories.loc[train_reduced_categories['Category'] == 'LARCENY/THEFT']
larceny_theft
Out[23]:
DatesCategoryDescriptDayOfWeekPdDistrictResolutionAddressXY34589...878002878023878036878045878046
2015-05-13 23:30:00 LARCENY/THEFT GRAND THEFT FROM LOCKED AUTO Wednesday NORTHERN NONE 1500 Block of LOMBARD ST -122.426995 37.800873
2015-05-13 23:30:00 LARCENY/THEFT GRAND THEFT FROM LOCKED AUTO Wednesday PARK NONE 100 Block of BRODERICK ST -122.438738 37.771541
2015-05-13 23:30:00 LARCENY/THEFT GRAND THEFT FROM UNLOCKED AUTO Wednesday INGLESIDE NONE 0 Block of TEDDY AV -122.403252 37.713431
2015-05-13 23:00:00 LARCENY/THEFT GRAND THEFT FROM LOCKED AUTO Wednesday RICHMOND NONE 600 Block of 47TH AV -122.508194 37.776601
2015-05-13 23:00:00 LARCENY/THEFT GRAND THEFT FROM LOCKED AUTO Wednesday CENTRAL NONE JEFFERSON ST / LEAVENWORTH ST -122.419088 37.807802
... ... ... ... ... ... ... ... ...
2003-01-06 06:00:00 LARCENY/THEFT PETTY THEFT FROM LOCKED AUTO Monday SOUTHERN NONE 5TH ST / HARRISON ST -122.401846 37.779032
2003-01-06 02:15:00 LARCENY/THEFT GRAND THEFT PICKPOCKET Monday TENDERLOIN NONE 600 Block of ELLIS ST -122.416894 37.784286
2003-01-06 00:55:00 LARCENY/THEFT PETTY THEFT SHOPLIFTING Monday NORTHERN NONE 1300 Block of WEBSTER ST -122.431046 37.783030
2003-01-06 00:01:00 LARCENY/THEFT GRAND THEFT FROM LOCKED AUTO Monday INGLESIDE NONE 600 Block of EDNA ST -122.447364 37.731948
2003-01-06 00:01:00 LARCENY/THEFT GRAND THEFT FROM LOCKED AUTO Monday SOUTHERN NONE 5TH ST / FOLSOM ST -122.403390 37.780266

174320 rows × 9 columns

In [24]:
larceny_theft['Descript'].unique()
Out[24]:
array(['GRAND THEFT FROM LOCKED AUTO', 'GRAND THEFT FROM UNLOCKED AUTO',
       'PETTY THEFT FROM LOCKED AUTO',
       'ATTEMPTED THEFT FROM LOCKED VEHICLE', 'PETTY THEFT SHOPLIFTING',
       'GRAND THEFT FROM PERSON', 'PETTY THEFT FROM A BUILDING',
       'GRAND THEFT OF PROPERTY', 'GRAND THEFT FROM A BUILDING',
       'PETTY THEFT OF PROPERTY', 'GRAND THEFT SHOPLIFTING',
       'THEFT OF ANIMALS (GENERAL)', 'PETTY THEFT BICYCLE',
       'GRAND THEFT BICYCLE', 'ATTEMPTED THEFT OF A BICYCLE',
       'ATTEMPTED GRAND THEFT PURSESNATCH',
       'THEFT OF COMPUTERS OR CELL PHONES', 'GRAND THEFT PICKPOCKET',
       'PETTY THEFT FROM UNLOCKED AUTO', 'THEFT FROM MERCHANT OR LIBRARY',
       'ATTEMPTED THEFT FROM A BUILDING',
       'THEFT OF CHECKS OR CREDIT CARDS', 'ATTEMPTED SHOPLIFTING',
       'LOST PROPERTY, PETTY THEFT',
       'ATTEMPTED THEFT FROM UNLOCKED VEHICLE',
       'THEFT, GRAND, OF FIREARM', 'GRAND THEFT AUTO STRIP',
       'ATTEMPTED GRAND THEFT FROM PERSON', 'THEFT, DRUNK ROLL, <$50',
       'PETTY THEFT AUTO STRIP', 'ATTEMPTED PETTY THEFT OF PROPERTY',
       'GRAND THEFT PURSESNATCH',
       'EMBEZZLEMENT FROM DEPENDENT OR ELDER ADULT BY CARETAKER',
       'THEFT, DRUNK ROLL, $50-$200', 'LOST PROPERTY, GRAND THEFT',
       'LICENSE PLATE OR TAB, THEFT OF', 'THEFT OF UTILITY SERVICES',
       'THEFT, BICYCLE, <$50, SERIAL NUMBER KNOWN',
       'THEFT, GRAND, BY FIDUCIARY, >$400 IN 12 MONTHS',
       'TRADE SECRETS, THEFT OR UNAUTHORIZED COPYING',
       'PETTY THEFT COIN OPERATED MACHINE', 'GRAND THEFT BY PROSTITUTE',
       'PETTY THEFT WITH PRIOR', 'THEFT OF WRITTEN INSTRUMENT',
       'PETTY THEFT MOTORCYCLE STRIP', 'THEFT, BOAT',
       'THEFT, DRUNK ROLL, $200-$400',
       'THEFT, BICYCLE, <$50, NO SERIAL NUMBER',
       'GRAND THEFT MOTORCYCLE STRIP', 'THEFT, DRUNK ROLL, >$400',
       'THEFT, GRAND, AGRICULTURAL', 'ATTEMPTED GRAND THEFT PICKPOCKET',
       'THEFT, ANIMAL, ATT.',
       'THEFT OF TELECOMMUNICATION SERVICES, INCL. CLONE PHONE',
       'GRAND THEFT COIN OPERATED MACHINE',
       'ATTEMPTED THEFT COIN OPERATED MACHINE', 'ATTEMPTED AUTO STRIP',
       'PETTY THEFT PHONE BOOTH', 'THEFT, DRUNK ROLL, ATT.',
       'ATTEMPTED MOTORCYCLE STRIP', 'GRAND THEFT PHONE BOOTH',
       'ATTEMPTED THEFT PHONE BOOTH'], dtype=object)
In [25]:
vehicle_theft = train_reduced_categories.loc[train_reduced_categories['Category'] == 'VEHICLE THEFT']
vehicle_theft
Out[25]:
DatesCategoryDescriptDayOfWeekPdDistrictResolutionAddressXY67465960...877980877994877999878000878022
2015-05-13 23:30:00 VEHICLE THEFT STOLEN AUTOMOBILE Wednesday INGLESIDE NONE AVALON AV / PERU AV -122.423327 37.725138
2015-05-13 23:30:00 VEHICLE THEFT STOLEN AUTOMOBILE Wednesday BAYVIEW NONE KIRKWOOD AV / DONAHUE ST -122.371274 37.727564
2015-05-13 20:00:00 VEHICLE THEFT STOLEN MOTORCYCLE Wednesday INGLESIDE NONE 0 Block of CRESCENT AV -122.423702 37.735233
2015-05-13 19:28:00 VEHICLE THEFT STOLEN AND RECOVERED VEHICLE Wednesday CENTRAL NONE 0 Block of SANSOME ST -122.400720 37.790712
2015-05-13 19:28:00 VEHICLE THEFT STOLEN AUTOMOBILE Wednesday CENTRAL NONE 0 Block of SANSOME ST -122.400720 37.790712
... ... ... ... ... ... ... ... ...
2003-01-06 08:13:00 VEHICLE THEFT STOLEN MOTORCYCLE Monday TARAVAL ARREST, CITED JUNIPERO SERRA BL / HOLLOWAY AV -122.472322 37.721622
2003-01-06 07:20:00 VEHICLE THEFT VEHICLE, RECOVERED, AUTO Monday INGLESIDE NONE 400 Block of BENTON AV -122.418502 37.733945
2003-01-06 06:30:00 VEHICLE THEFT STOLEN TRUCK Monday BAYVIEW NONE 1400 Block of EGBERT AV -122.393819 37.723277
2003-01-06 06:30:00 VEHICLE THEFT VEHICLE, RECOVERED, OTHER VEHICLE Monday BAYVIEW NONE 1400 Block of EGBERT AV -122.393819 37.723277
2003-01-06 02:16:00 VEHICLE THEFT RECOVERED VEHICLE - STOLEN OUTSIDE SF Monday MISSION NONE 17TH ST / MISSION ST -122.419516 37.763429

53706 rows × 9 columns

In [26]:
vehicle_theft['Descript'].value_counts()
Out[26]:
STOLEN AUTOMOBILE                                  26866
STOLEN TRUCK                                        8578
VEHICLE, RECOVERED, AUTO                            8017
VEHICLE, RECOVERED, OTHER VEHICLE                   2576
STOLEN MOTORCYCLE                                   2320
STOLEN AND RECOVERED VEHICLE                        2257
RECOVERED VEHICLE - STOLEN OUTSIDE SF               1613
ATTEMPTED STOLEN VEHICLE                             503
VEHICLE, RECOVERED, MOTORCYCLE                       411
TAMPERING WITH A VEHICLE                             232
STOLEN MISCELLANEOUS VEHICLE                         162
STOLEN TRAILER                                        52
AUTO, GRAND THEFT OF                                  48
VEHICLE, RECOVERED, CAMPER-HOUSE CAR-MOTOR HOME       45
VEHICLE, RECOVERED, MOBILE HOME-TRAILER                9
STOLEN BUS                                             8
VEHICLE, RENTAL, FAILURE TO RETURN                     7
VEHICLE, RECOVERED, BUS                                2
Name: Descript, dtype: int64
In [27]:
fig, count = plt.subplots(figsize = (15,10))
count = sns.countplot(y = vehicle_theft['Descript'])
count.set_title('Vehicle Theft Descriptions Count')
Out[27]:
Text(0.5, 1.0, 'Vehicle Theft Descriptions Count')
In [28]:
fig, count = plt.subplots(figsize = (15,10))
count = sns.countplot(y = vehicle_theft['Descript'],order=vehicle_theft['Descript'].value_counts().index)
count.set_title('Vehicle Theft Descriptions Count with Descending')
Out[28]:
Text(0.5, 1.0, 'Vehicle Theft Descriptions Count with Descending')
In [29]:
vandalism = train_reduced_categories.loc[train_reduced_categories['Category'] == 'VANDALISM']
vandalism
Out[29]:
DatesCategoryDescriptDayOfWeekPdDistrictResolutionAddressXY12323439126...878016878021878033878037878047
2015-05-13 22:30:00 VANDALISM MALICIOUS MISCHIEF, VANDALISM OF VEHICLES Wednesday TENDERLOIN NONE TURK ST / JONES ST -122.412414 37.783004
2015-05-13 20:45:00 VANDALISM MALICIOUS MISCHIEF, VANDALISM Wednesday NORTHERN NONE 1500 Block of FILLMORE ST -122.432744 37.783842
2015-05-13 20:30:00 VANDALISM MALICIOUS MISCHIEF, VANDALISM Wednesday NORTHERN NONE 1600 Block of WEBSTER ST -122.431310 37.785871
2015-05-13 20:25:00 VANDALISM MALICIOUS MISCHIEF, VANDALISM Wednesday INGLESIDE NONE 0 Block of WINDING WY -122.432446 37.710833
2015-05-13 17:07:00 VANDALISM MALICIOUS MISCHIEF, VANDALISM Wednesday NORTHERN NONE 1100 Block of FILLMORE ST -122.431980 37.780048
... ... ... ... ... ... ... ... ...
2003-01-06 03:15:00 VANDALISM MALICIOUS MISCHIEF, VANDALISM OF VEHICLES Monday BAYVIEW NONE 23RD ST / WISCONSIN ST -122.398696 37.754746
2003-01-06 02:24:00 VANDALISM MALICIOUS MISCHIEF Monday NORTHERN NOT PROSECUTED SANCHEZ ST / 14TH ST -122.431191 37.767595
2003-01-06 01:30:00 VANDALISM MALICIOUS MISCHIEF, VANDALISM Monday RICHMOND ARREST, CITED 1000 Block of 22ND AV -122.391668 37.757793
2003-01-06 00:55:00 VANDALISM MALICIOUS MISCHIEF, VANDALISM Monday NORTHERN NONE 1300 Block of WEBSTER ST -122.431046 37.783030
2003-01-06 00:01:00 VANDALISM MALICIOUS MISCHIEF, VANDALISM OF VEHICLES Monday SOUTHERN NONE TOWNSEND ST / 2ND ST -122.390531 37.780607

44581 rows × 9 columns

In [30]:
vandalism['Descript'].value_counts()
Out[30]:
MALICIOUS MISCHIEF, VANDALISM OF VEHICLES                   17717
MALICIOUS MISCHIEF, VANDALISM                               15918
MALICIOUS MISCHIEF, BREAKING WINDOWS                         4970
MALICIOUS MISCHIEF, GRAFFITI                                 3749
MALICIOUS MISCHIEF                                            475
MALICIOUS MISCHIEF, TIRE SLASHING                             300
MALICIOUS MISCHIEF, STREET CARS/BUSES                         273
VANDALISM OR GRAFFITI TOOLS, POSSESSION                       268
MALICIOUS MISCHIEF, BREAKING WINDOWS WITH BB GUN              260
MALICIOUS MISCHIEF, ADULT SUSPECT                             231
MALICIOUS MISCHIEF, JUVENILE SUSPECT                           86
GRAFFITI ON GOVERNMENT VEHICLES OR PUBLIC TRANSPORTATION       58
MALICIOUS MISCHIEF, BUILDING UNDER CONSTRUCTION                53
VANDALISM OR GRAFFITI ON OR WITHIN 100 FT OF HIGHWAY           52
MALICIOUS MISCHIEF, FICTITIOUS PHONE CALLS                     52
DAMAGE TO FIRE ALARM APPARATUS                                 32
AEROSOL CONTAINER; SALE, PURCHASE OR POSSESSION OF             24
DAMAGE/DESTRUCTION OF MAIL                                     17
ELECTRICAL  OR GAS LINES, INTERFERING WITH                     15
DAMAGE TO MAIL BOX                                             13
DAMAGE TO PARKING METERS                                        9
VANDALISM WITH NOXIOUS CHEMICAL                                 4
BALLOONS, ELECTRICALLY CONDUCTIVE                               2
DESTROYING JAIL PROPERTY-OVER $200                              2
DESTROYING JAIL PROPERTY-$200 OR UNDER                          1
Name: Descript, dtype: int64
In [31]:
fig, count = plt.subplots(figsize = (15,10))
count = sns.countplot(y = vandalism['Descript'])
count.set_title('Vandalism Descriptions Count')
Out[31]:
Text(0.5, 1.0, 'Vandalism Descriptions Count')
In [32]:
fig, count = plt.subplots(figsize = (15,10))
count = sns.countplot(y = vandalism['Descript'], order = vandalism['Descript'].value_counts().index)
count.set_title('Vandalism Descriptions Count with descending')
Out[32]:
Text(0.5, 1.0, 'Vandalism Descriptions Count with descending')
In [33]:
non_criminal = train_reduced_categories.loc[train_reduced_categories['Category'] == 'NON-CRIMINAL']
non_criminal
Out[33]:
DatesCategoryDescriptDayOfWeekPdDistrictResolutionAddressXY1415192325...877927877939877955877976877990
2015-05-13 22:00:00 NON-CRIMINAL FOUND PROPERTY Wednesday BAYVIEW NONE 200 Block of WILLIAMS AV -122.397744 37.729935
2015-05-13 22:00:00 NON-CRIMINAL FOUND PROPERTY Wednesday BAYVIEW NONE 0 Block of MENDELL ST -122.383692 37.743189
2015-05-13 21:30:00 NON-CRIMINAL FOUND PROPERTY Wednesday TENDERLOIN NONE 100 Block of JONES ST -122.412250 37.782556
2015-05-13 21:11:00 NON-CRIMINAL STAY AWAY OR COURT ORDER, NON-DV RELATED Wednesday TENDERLOIN NONE 100 Block of JONES ST -122.412250 37.782556
2015-05-13 21:00:00 NON-CRIMINAL LOST PROPERTY Wednesday TENDERLOIN NONE 300 Block of OFARRELL ST -122.410509 37.786043
... ... ... ... ... ... ... ... ...
2003-01-06 10:40:00 NON-CRIMINAL FOUND PROPERTY Monday TARAVAL NONE 45TH AV / PACHECO ST -122.504071 37.749281
2003-01-06 10:15:00 NON-CRIMINAL FOUND PROPERTY Monday INGLESIDE NONE 100 Block of PEABODY ST -122.407445 37.710393
2003-01-06 09:27:00 NON-CRIMINAL AIDED CASE, MENTAL DISTURBED Monday SOUTHERN NONE 100 Block of BLUXOME ST -122.399064 37.775012
2003-01-06 08:29:00 NON-CRIMINAL AIDED CASE, MENTAL DISTURBED Monday CENTRAL PSYCHOPATHIC CASE 400 Block of BROADWAY ST -122.405065 37.798013
2003-01-06 07:40:00 NON-CRIMINAL AIDED CASE, MENTAL DISTURBED Monday NORTHERN NONE 2100 Block of GREENWICH ST -122.435072 37.799109

91915 rows × 9 columns

In [34]:
non_criminal['Descript'].value_counts()
Out[34]:
LOST PROPERTY                                         31498
AIDED CASE, MENTAL DISTURBED                          21488
FOUND PROPERTY                                        12078
AIDED CASE                                             5417
DEATH REPORT, CAUSE UNKNOWN                            4206
CASE CLOSURE                                           2257
STAY AWAY OR COURT ORDER, NON-DV RELATED               1653
AIDED CASE, DOG BITE                                   1336
CIVIL SIDEWALKS, CITATION                              1198
PROPERTY FOR IDENTIFICATION                            1196
AIDED CASE, INJURED PERSON                             1060
DEATH REPORT, NATURAL CAUSES                           1011
CIVIL SIDEWALKS, WARNING                                902
COURTESY REPORT                                         870
FIRE REPORT                                             794
AIDED CASE -PROPERTY FOR DESTRUCTION                    744
LOCATED PROPERTY                                        694
TARASOFF REPORT                                         671
SEARCH WARRANT SERVICE                                  539
TURNED IN GUN                                           494
TRAFFIC ACCIDENT                                        431
SHELTER                                                 384
IMPOUNDED VEHICLE                                       316
AIDED CASE, SICK PERSON                                 254
LICENSE PLATE, FOUND                                     97
MISPLACED VEHICLE                                        61
LICENSE PLATE, RECOVERED                                 57
CIVIL SIDEWALKS, BOOKING                                 41
CIVIL SIDEWALKS, VIOLATION                               37
TRUANT, HABITUAL                                         35
DEATH, ACCIDENTAL                                        19
DEATH, NON-MANSLAUGHTER AUTO ACCIDENT                    15
YOUTH COURT                                              15
ACCIDENTAL SHOOTING                                      13
DEMONSTRATION, VIDEO EVIDENCE, MISC. INVESTIGATION       11
MEGAN'S LAW NOTIFICATION                                 10
AUTO IMPOUNDED                                            6
DEATH REPORT, IN CUSTODY                                  5
ACCIDENTAL BURNS                                          2
Name: Descript, dtype: int64
In [35]:
fig, count = plt.subplots(figsize = (17,15))
count = sns.countplot(y = non_criminal['Descript'])
count.set_title('Non-Criminal Events Descriptions Count')
Out[35]:
Text(0.5, 1.0, 'Non-Criminal Events Descriptions Count')
In [36]:
fig, count = plt.subplots(figsize = (17,15))
count = sns.countplot(y = non_criminal['Descript'],order=non_criminal['Descript'].value_counts().index)
count.set_title('Non-Criminal Events Descriptions Count')
Out[36]:
Text(0.5, 1.0, 'Non-Criminal Events Descriptions Count')
In [37]:
assault = train_reduced_categories.loc[train_reduced_categories['Category'] == 'ASSAULT']
assault
Out[37]:
DatesCategoryDescriptDayOfWeekPdDistrictResolutionAddressXY1751109114165...878028878031878035878040878042
2015-05-13 21:55:00 ASSAULT AGGRAVATED ASSAULT WITH BODILY FORCE Wednesday INGLESIDE NONE GODEUS ST / MISSION ST -122.421682 37.742822
2015-05-13 19:33:00 ASSAULT AGGRAVATED ASSAULT WITH BODILY FORCE Wednesday BAYVIEW NONE 23RD ST / WISCONSIN ST -122.398696 37.754746
2015-05-13 17:47:00 ASSAULT CHILD ABUSE (PHYSICAL) Wednesday BAYVIEW NONE 0 Block of WHITFIELD CT -122.381838 37.731104
2015-05-13 17:40:00 ASSAULT THREATS AGAINST LIFE Wednesday CENTRAL NONE 1400 Block of STOCKTON ST -122.409032 37.799253
2015-05-13 15:40:00 ASSAULT BATTERY, FORMER SPOUSE OR DATING RELATIONSHIP Wednesday PARK NONE 1700 Block of MCALLISTER ST -122.440880 37.777532
... ... ... ... ... ... ... ... ...
2003-01-06 02:00:00 ASSAULT AGGRAVATED ASSAULT WITH BODILY FORCE Monday SOUTHERN NONE 6TH ST / MARKET ST -122.410294 37.782231
2003-01-06 01:50:00 ASSAULT BATTERY Monday BAYVIEW NONE 3RD ST / NEWCOMB AV -122.390417 37.735593
2003-01-06 00:55:00 ASSAULT BATTERY Monday NORTHERN NONE 1300 Block of WEBSTER ST -122.431046 37.783030
2003-01-06 00:33:00 ASSAULT INFLICT INJURY ON COHABITEE Monday MISSION NONE 2800 Block of FOLSOM ST -122.414073 37.751685
2003-01-06 00:20:00 ASSAULT ATTEMPTED HOMICIDE WITH A GUN Monday BAYVIEW ARREST, BOOKED 1500 Block of SHAFTER AV -122.389769 37.730564

76815 rows × 9 columns

In [38]:
assault['Descript'].unique()
Out[38]:
array(['AGGRAVATED ASSAULT WITH BODILY FORCE', 'CHILD ABUSE (PHYSICAL)',
       'THREATS AGAINST LIFE',
       'BATTERY, FORMER SPOUSE OR DATING RELATIONSHIP',
       'SHOOTING INTO INHABITED DWELLING OR OCCUPIED VEHICLE', 'BATTERY',
       'AGGRAVATED ASSAULT WITH A DEADLY WEAPON',
       'AGGRAVATED ASSAULT WITH A KNIFE', 'INFLICT INJURY ON COHABITEE',
       'ASSAULT WITH CAUSTIC CHEMICALS', 'AGGRAVATED ASSAULT WITH A GUN',
       'ASSAULT, AGGRAVATED, W/ GUN',
       'ELDER ADULT OR DEPENDENT ABUSE (NOT EMBEZZLEMENT OR THEFT)',
       'TRESPASS WITHIN 30 DAYS OF CREDIBLE THREAT',
       'BATTERY OF A POLICE OFFICER', 'ATTEMPTED HOMICIDE WITH A GUN',
       'FALSE IMPRISONMENT', 'ASSAULT',
       'ATTEMPTED HOMICIDE WITH BODILY FORCE',
       'THREATENING PHONE CALL(S)',
       'FIREARM, DISCHARGING AT OCCUPIED BLDG, VEHICLE, OR AIRCRAFT',
       'ATTEMPTED HOMICIDE WITH A KNIFE',
       'THREAT OR FORCE TO RESIST EXECUTIVE OFFICER',
       'THREATS TO SCHOOL TEACHERS',
       'ATTEMPTED MAYHEM WITH A DEADLY WEAPON', 'STALKING',
       'BATTERY WITH SERIOUS INJURIES',
       'AGGRAVATED ASSAULT OF POLICE OFFICER,BODILY FORCE',
       'THREATENING SCHOOL OR PUBLIC EMPLOYEE',
       'ASSAULT ON A POLICE OFFICER WITH A DEADLY WEAPON',
       'ATTEMPTED SIMPLE ASSAULT', 'MAYHEM WITH A KNIFE',
       'UNLAWFUL DISSUADING/THREATENING OF A WITNESS',
       'RESISTING PEACE OFFICER, CAUSING THEIR SERIOUS INJURY OR DEATH',
       'DISCHARGING IN GROSSLY NEGLIGENT MANNER',
       'CHILD, INFLICTING INJURY RESULTING IN TRAUMATIC CONDITION',
       'ATTEMPTED MAYHEM WITH BODILY FORCE',
       'MAYHEM WITH A DEADLY WEAPON',
       'ATTEMPTED HOMICIDE WITH A DANGEROUS WEAPON',
       'FIREARM, DISCHARGING IN GROSSLY NEGLIGENT MANNER',
       'WILLFUL CRUELTY TO CHILD',
       'AGGRAVATED ASSAULT ON POLICE OFFICER WITH A KNIFE',
       'THREAT TO STATE OFFICIAL OR JUDGE', 'MAYHEM WITH BODILY FORCE',
       'MAYHEM WITH A GUN',
       'CIVIL RIGHTS, INCL. INJURY, THREAT, OR DAMAGE (HATE CRIMES)',
       'ASSAULT, AGGRAVATED, ON POLICE OFFICER, W/ GUN',
       'ASSAULT BY POLICE OFFICER',
       'LASERS, DISCHARGING OR LIGHTS AT AIRCRAFT',
       'ATTEMPTED MAYHEM WITH A KNIFE',
       "ASSAULT OR ATTEMPTED MURDER UPON GOV'T OFFICERS",
       'BATTERY DURING LABOR DISPUTE',
       'ASSAULT, AGGRAVATED, ON POLICE OFFICER, W/ SEMI AUTO',
       'ASSAULT, AGGRAVATED, W/ MACHINE GUN',
       'ASSAULT, AGGRAVATED, W/ SEMI AUTO', 'ASSAULT BY POISONING',
       'ASSAULT, AGGRAVATED, ON POLICE OFFICER, W/ FULL AUTO',
       'AGGRAVATED ASSAULT ON POLICE OFFICER WITH A GUN',
       'TERRORIZING BY ARSON OR EXPLOSIVE DEVICE',
       'TERRORIZING BY MARKING PRIVATE PROPERTY',
       'ATTEMPTED MAYHEM WITH A GUN',
       'AGGRAVATED ASSAULT OF POLICE OFFICER, SNIPING',
       'ATTEMPTED HOMICIDE WITH EXPLOSIVES'], dtype=object)
In [39]:
train_reduced_categories.loc[train_reduced_categories['Category'].isin(['LARCENY/THEFT', 'VEHICLE THEFT']), 'Category'] = 'THEFT'
train_reduced_categories['Category'].unique()
/opt/conda/lib/python3.7/site-packages/pandas/core/indexing.py:1817: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)
Out[39]:
array(['WARRANTS', 'OTHER OFFENSES', 'THEFT', 'VANDALISM', 'NON-CRIMINAL',
       'ASSAULT', 'BURGLARY', 'DRUG/NARCOTIC'], dtype=object)
In [40]:
fig, count = plt.subplots(figsize = (15,5))
count = sns.countplot(y = train_reduced_categories['Category'])
count.set_title('Most common types of crimes')
Out[40]:
Text(0.5, 1.0, 'Most common types of crimes')
In [41]:
fig, count = plt.subplots(figsize = (15,5))
count = sns.countplot(y = train_reduced_categories['Category'],order=train_reduced_categories['Category'].value_counts().index)
count.set_title('Most common types of crimes')
Out[41]:
Text(0.5, 1.0, 'Most common types of crimes')

["PdDistrict"] Column

In [42]:
train_clean = train_reduced_categories

구역 별 범죄량

In [43]:
import matplotlib.pyplot as plt
import seaborn as sns

fig, count = plt.subplots(figsize = (12,10))
count = sns.countplot(x = train_clean['PdDistrict'])
count.set_title('Districts Count')
Out[43]:
Text(0.5, 1.0, 'Districts Count')
In [44]:
import matplotlib.pyplot as plt
import seaborn as sns

fig, count = plt.subplots(figsize = (12,10))
count = sns.countplot(x = train_clean['PdDistrict'], order=train_clean['PdDistrict'].value_counts().index)
count.set_title('Districts Count')
Out[44]:
Text(0.5, 1.0, 'Districts Count')

지역 + 카테고리 크로스 데이터 정리

In [45]:
ct_district_cat = pd.crosstab(train_clean['Category'], train_clean['PdDistrict'])
ct_district_cat

# npp = np.array(ct_district_cat)
# sum = 0
# for i in range(0, npp.shape[0]):
#     for j in range(0, npp.shape[1]):
#         sum+= npp[i][j]
# sum
Out[45]:
PdDistrictBAYVIEWCENTRALINGLESIDEMISSIONNORTHERNPARKRICHMONDSOUTHERNTARAVALTENDERLOINCategoryASSAULTBURGLARYDRUG/NARCOTICNON-CRIMINALOTHER OFFENSESTHEFTVANDALISMWARRANTS
9845 6971 8522 11146 8312 3512 3198 12175 5460 7674
3914 4499 3327 3736 5827 2875 2683 4796 3459 1484
4496 1805 2372 8750 4508 2570 999 9222 1529 17668
6083 10923 6846 12344 10206 5886 5733 19548 6910 7436
17030 8893 13176 19308 12226 6180 5619 21245 8599 13684
17274 29181 19157 25335 34852 13073 13959 46382 17940 10873
5344 4451 5363 5273 5397 2603 3155 6533 4850 1612
4319 2776 2522 6605 4592 2308 1008 9083 1613 7319

stack()

In [46]:
stacked= ct_district_cat.stack().reset_index().rename(columns= {0:'value'})
fig, bar= plt.subplots(figsize= (15,10))
bar= sns.barplot(x= stacked['PdDistrict'], y= stacked['value'], hue= stacked['Category'])
bar.set_title('Categories Count per District')
Out[46]:
Text(0.5, 1.0, 'Categories Count per District')
In [47]:
fig, heatmap = plt.subplots(figsize = (15,10))
heatmap = sns.heatmap(ct_district_cat, annot= True, cmap= 'Reds')
heatmap.set_title('Categories X Districts')
Out[47]:
Text(0.5, 1.0, 'Categories X Districts')
In [48]:
with plt.style.context('fivethirtyeight'):
    fig, ax = plt.subplots(1, 1, figsize=(19, 19))
    # 위도 경도 데이터를 바탕으로
    sns.scatterplot(data=train.iloc[:250000], x='X', y='Y', alpha=0.6, palette='rocket', hue='Category', size='Category') # 878049 
    # legend() => 범례 표시하기
    plt.legend(bbox_to_anchor=(1.0, 1.0), loc='upper left')
In [ ]: