https://www.kaggle.com/code/emilyjiminroh/eda-visualization-san-francisco-notebook
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
for filename in filenames:
print(os.path.join(dirname, filename))
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
/kaggle/input/san-francisco-crime-classification/train.csv
/kaggle/input/san-francisco-crime-classification/test.csv
In [2]:
# 데이터를 우선 가져와야합니다.
train = pd.read_csv('../input/san-francisco-crime-classification/train.csv')
train
Out[2]:
DatesCategoryDescriptDayOfWeekPdDistrictResolutionAddressXY01234...878044878045878046878047878048
2015-05-13 23:53:00 | WARRANTS | WARRANT ARREST | Wednesday | NORTHERN | ARREST, BOOKED | OAK ST / LAGUNA ST | -122.425892 | 37.774599 |
2015-05-13 23:53:00 | OTHER OFFENSES | TRAFFIC VIOLATION ARREST | Wednesday | NORTHERN | ARREST, BOOKED | OAK ST / LAGUNA ST | -122.425892 | 37.774599 |
2015-05-13 23:33:00 | OTHER OFFENSES | TRAFFIC VIOLATION ARREST | Wednesday | NORTHERN | ARREST, BOOKED | VANNESS AV / GREENWICH ST | -122.424363 | 37.800414 |
2015-05-13 23:30:00 | LARCENY/THEFT | GRAND THEFT FROM LOCKED AUTO | Wednesday | NORTHERN | NONE | 1500 Block of LOMBARD ST | -122.426995 | 37.800873 |
2015-05-13 23:30:00 | LARCENY/THEFT | GRAND THEFT FROM LOCKED AUTO | Wednesday | PARK | NONE | 100 Block of BRODERICK ST | -122.438738 | 37.771541 |
... | ... | ... | ... | ... | ... | ... | ... | ... |
2003-01-06 00:15:00 | ROBBERY | ROBBERY ON THE STREET WITH A GUN | Monday | TARAVAL | NONE | FARALLONES ST / CAPITOL AV | -122.459033 | 37.714056 |
2003-01-06 00:01:00 | LARCENY/THEFT | GRAND THEFT FROM LOCKED AUTO | Monday | INGLESIDE | NONE | 600 Block of EDNA ST | -122.447364 | 37.731948 |
2003-01-06 00:01:00 | LARCENY/THEFT | GRAND THEFT FROM LOCKED AUTO | Monday | SOUTHERN | NONE | 5TH ST / FOLSOM ST | -122.403390 | 37.780266 |
2003-01-06 00:01:00 | VANDALISM | MALICIOUS MISCHIEF, VANDALISM OF VEHICLES | Monday | SOUTHERN | NONE | TOWNSEND ST / 2ND ST | -122.390531 | 37.780607 |
2003-01-06 00:01:00 | FORGERY/COUNTERFEITING | CHECKS, FORGERY (FELONY) | Monday | BAYVIEW | NONE | 1800 Block of NEWCOMB AV | -122.394926 | 37.738212 |
878049 rows × 9 columns
In [3]:
train.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 878049 entries, 0 to 878048
Data columns (total 9 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Dates 878049 non-null object
1 Category 878049 non-null object
2 Descript 878049 non-null object
3 DayOfWeek 878049 non-null object
4 PdDistrict 878049 non-null object
5 Resolution 878049 non-null object
6 Address 878049 non-null object
7 X 878049 non-null float64
8 Y 878049 non-null float64
dtypes: float64(2), object(7)
memory usage: 60.3+ MB
NULL 값 확인
In [4]:
# null 값이 존재하는가?
train.isnull().sum()
Out[4]:
Dates 0
Category 0
Descript 0
DayOfWeek 0
PdDistrict 0
Resolution 0
Address 0
X 0
Y 0
dtype: int64
중복 값 확인 후 처리
In [5]:
# 중복 값 확인
train.duplicated().sum()
Out[5]:
2323
In [6]:
#shape는 데이터의 구조를 알려준다. (row수와 column수)
shape_before = train.shape
shape_before
Out[6]:
(878049, 9)
In [7]:
# DataFrame.drop_duplicates(subset=None, keep='first', inplace=False, ignore_index=False)
train.drop_duplicates(subset=None,keep="first",inplace=True)
In [8]:
shape_after = train.shape
shape_after
Out[8]:
(875726, 9)
In [9]:
# 데이터 중복 처리 결과 확인
print('original_sample: ',shape_before,
'\nafter_drop_sample: ', shape_after,
'\nTotal_drop_Check: ',shape_before[0],'-',shape_after[0],
'=', shape_before[0]-shape_after[0],'Samples')
original_sample: (878049, 9)
after_drop_sample: (875726, 9)
Total_drop_Check: 878049 - 875726 = 2323 Samples
범죄 종류 당 횟수 출력
In [10]:
# 같은 카테고리를 가진 값들 개수 반환
pd.DataFrame(train['Category'].value_counts(ascending=False))
Out[10]:
CategoryLARCENY/THEFTOTHER OFFENSESNON-CRIMINALASSAULTDRUG/NARCOTICVEHICLE THEFTVANDALISMWARRANTSBURGLARYSUSPICIOUS OCCMISSING PERSONROBBERYFRAUDFORGERY/COUNTERFEITINGSECONDARY CODESWEAPON LAWSPROSTITUTIONTRESPASSSTOLEN PROPERTYSEX OFFENSES FORCIBLEDISORDERLY CONDUCTDRUNKENNESSRECOVERED VEHICLEKIDNAPPINGDRIVING UNDER THE INFLUENCELIQUOR LAWSRUNAWAYARSONLOITERINGEMBEZZLEMENTSUICIDEFAMILY OFFENSESBAD CHECKSBRIBERYEXTORTIONSEX OFFENSES NON FORCIBLEGAMBLINGPORNOGRAPHY/OBSCENE MATTREA
174320 |
125960 |
91915 |
76815 |
53919 |
53706 |
44581 |
42145 |
36600 |
31394 |
25669 |
22988 |
16637 |
10592 |
9979 |
8550 |
7446 |
7318 |
4537 |
4380 |
4313 |
4277 |
3132 |
2340 |
2268 |
1899 |
1894 |
1512 |
1207 |
1164 |
508 |
488 |
406 |
289 |
256 |
148 |
146 |
22 |
6 |
범죄 종류 당 횟수 시각화
In [11]:
# 시각화 with matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
# subplots 하나의 그래프에 여러 값들 겹쳐서 나타내기
# plt.subplot(row,column,index)
fig, count = plt.subplots(figsize=(17,15))
# countplot: 각 카데고리 값 별로 데이터가 얼마나 있는지 표시
count = sns.countplot(y=train['Category'])
count.set_title('Crime counts')
# 내림차순으로 보이게 하려면??
# order 인자 추가 아래 코드!!
Out[11]:
Text(0.5, 1.0, 'Crime counts')
내림차순 정렬
In [12]:
fig, count = plt.subplots(figsize=(17,15))
count = sns.countplot(y=train['Category'], order = train['Category'].value_counts().index)
count.set_title('Crime counts with decending')
Out[12]:
Text(0.5, 1.0, 'Crime counts with decending')
범죄 종류 별 빈도 파악
In [13]:
# frequency랑 누적_frequency의 차이점은?
# frequency는 고유의 빈도수
# cumulative_freq는 이전 고유 빈도수의 누적 + 현재의 누적 값
# 고유 행수가 포함된 열 개수반환, normalize 비율 반환
category_freq = pd.DataFrame(train['Category'].value_counts(normalize=True))
# index 다시 생성
category_freq.reset_index(inplace=True)
# index -> Category, Category -> frequency
category_freq.rename({'index': 'Category', 'Category': 'Frequency'}, axis='columns', inplace=True)
# 누적 합계를 포함하는 동일한 크기의 DataFrame 반환
category_freq['Cumulative_Frequency'] = category_freq['Frequency'].cumsum()
category_freq
Out[13]:
CategoryFrequencyCumulative_Frequency01234567891011121314151617181920212223242526272829303132333435363738
LARCENY/THEFT | 0.199058 | 0.199058 |
OTHER OFFENSES | 0.143835 | 0.342893 |
NON-CRIMINAL | 0.104959 | 0.447851 |
ASSAULT | 0.087716 | 0.535567 |
DRUG/NARCOTIC | 0.061571 | 0.597138 |
VEHICLE THEFT | 0.061327 | 0.658465 |
VANDALISM | 0.050907 | 0.709373 |
WARRANTS | 0.048126 | 0.757498 |
BURGLARY | 0.041794 | 0.799292 |
SUSPICIOUS OCC | 0.035849 | 0.835141 |
MISSING PERSON | 0.029312 | 0.864453 |
ROBBERY | 0.026250 | 0.890703 |
FRAUD | 0.018998 | 0.909701 |
FORGERY/COUNTERFEITING | 0.012095 | 0.921796 |
SECONDARY CODES | 0.011395 | 0.933191 |
WEAPON LAWS | 0.009763 | 0.942955 |
PROSTITUTION | 0.008503 | 0.951457 |
TRESPASS | 0.008356 | 0.959814 |
STOLEN PROPERTY | 0.005181 | 0.964995 |
SEX OFFENSES FORCIBLE | 0.005002 | 0.969996 |
DISORDERLY CONDUCT | 0.004925 | 0.974921 |
DRUNKENNESS | 0.004884 | 0.979805 |
RECOVERED VEHICLE | 0.003576 | 0.983382 |
KIDNAPPING | 0.002672 | 0.986054 |
DRIVING UNDER THE INFLUENCE | 0.002590 | 0.988644 |
LIQUOR LAWS | 0.002168 | 0.990812 |
RUNAWAY | 0.002163 | 0.992975 |
ARSON | 0.001727 | 0.994702 |
LOITERING | 0.001378 | 0.996080 |
EMBEZZLEMENT | 0.001329 | 0.997409 |
SUICIDE | 0.000580 | 0.997989 |
FAMILY OFFENSES | 0.000557 | 0.998546 |
BAD CHECKS | 0.000464 | 0.999010 |
BRIBERY | 0.000330 | 0.999340 |
EXTORTION | 0.000292 | 0.999632 |
SEX OFFENSES NON FORCIBLE | 0.000169 | 0.999801 |
GAMBLING | 0.000167 | 0.999968 |
PORNOGRAPHY/OBSCENE MAT | 0.000025 | 0.999993 |
TREA | 0.000007 | 1.000000 |
약 80%의 범죄 유형 데이터 정리
In [14]:
# 누적 frequency가 0.8 이하이전의 범죄 유형 리스트 만들기
candidate_classes = list(category_freq.loc[category_freq['Cumulative_Frequency'] < 0.8, 'Category'])
candidate_classes
Out[14]:
['LARCENY/THEFT',
'OTHER OFFENSES',
'NON-CRIMINAL',
'ASSAULT',
'DRUG/NARCOTIC',
'VEHICLE THEFT',
'VANDALISM',
'WARRANTS',
'BURGLARY']
In [15]:
# 데이터 프레임 슬라이싱
# 위에서 만든 리스트에 해당되는 데이터 행 불러오기
train_reduced_categories = train.loc[train['Category'].isin(candidate_classes)]
train_reduced_categories
Out[15]:
DatesCategoryDescriptDayOfWeekPdDistrictResolutionAddressXY01234...878042878043878045878046878047
2015-05-13 23:53:00 | WARRANTS | WARRANT ARREST | Wednesday | NORTHERN | ARREST, BOOKED | OAK ST / LAGUNA ST | -122.425892 | 37.774599 |
2015-05-13 23:53:00 | OTHER OFFENSES | TRAFFIC VIOLATION ARREST | Wednesday | NORTHERN | ARREST, BOOKED | OAK ST / LAGUNA ST | -122.425892 | 37.774599 |
2015-05-13 23:33:00 | OTHER OFFENSES | TRAFFIC VIOLATION ARREST | Wednesday | NORTHERN | ARREST, BOOKED | VANNESS AV / GREENWICH ST | -122.424363 | 37.800414 |
2015-05-13 23:30:00 | LARCENY/THEFT | GRAND THEFT FROM LOCKED AUTO | Wednesday | NORTHERN | NONE | 1500 Block of LOMBARD ST | -122.426995 | 37.800873 |
2015-05-13 23:30:00 | LARCENY/THEFT | GRAND THEFT FROM LOCKED AUTO | Wednesday | PARK | NONE | 100 Block of BRODERICK ST | -122.438738 | 37.771541 |
... | ... | ... | ... | ... | ... | ... | ... | ... |
2003-01-06 00:20:00 | ASSAULT | ATTEMPTED HOMICIDE WITH A GUN | Monday | BAYVIEW | ARREST, BOOKED | 1500 Block of SHAFTER AV | -122.389769 | 37.730564 |
2003-01-06 00:20:00 | OTHER OFFENSES | PAROLE VIOLATION | Monday | BAYVIEW | ARREST, BOOKED | 1500 Block of SHAFTER AV | -122.389769 | 37.730564 |
2003-01-06 00:01:00 | LARCENY/THEFT | GRAND THEFT FROM LOCKED AUTO | Monday | INGLESIDE | NONE | 600 Block of EDNA ST | -122.447364 | 37.731948 |
2003-01-06 00:01:00 | LARCENY/THEFT | GRAND THEFT FROM LOCKED AUTO | Monday | SOUTHERN | NONE | 5TH ST / FOLSOM ST | -122.403390 | 37.780266 |
2003-01-06 00:01:00 | VANDALISM | MALICIOUS MISCHIEF, VANDALISM OF VEHICLES | Monday | SOUTHERN | NONE | TOWNSEND ST / 2ND ST | -122.390531 | 37.780607 |
699961 rows × 9 columns
누적 80%를 차지하는 범죄유형 시각화
In [16]:
# 내림차순 정렬은 countplot에서 oreder라는 인자 추가해서 조정하면 된다!
fig, count = plt.subplots(figsize = (15,5))
count = sns.countplot(y = train_reduced_categories['Category'],order=train_reduced_categories['Category'].value_counts().index)
count.set_title('Most common types of crimes with descending')
Out[16]:
Text(0.5, 1.0, 'Most common types of crimes with descending')
In [17]:
warrants = train_reduced_categories.loc[train_reduced_categories['Category'] == 'WARRANTS']
warrants
Out[17]:
DatesCategoryDescriptDayOfWeekPdDistrictResolutionAddressXY022666775...877968877995878026878027878038
2015-05-13 23:53:00 | WARRANTS | WARRANT ARREST | Wednesday | NORTHERN | ARREST, BOOKED | OAK ST / LAGUNA ST | -122.425892 | 37.774599 |
2015-05-13 21:11:00 | WARRANTS | WARRANT ARREST | Wednesday | TENDERLOIN | NONE | 100 Block of JONES ST | -122.412250 | 37.782556 |
2015-05-13 19:11:00 | WARRANTS | ENROUTE TO PAROLE OFFICER | Wednesday | TENDERLOIN | NONE | 400 Block of HYDE ST | -122.416236 | 37.784913 |
2015-05-13 19:11:00 | WARRANTS | ENROUTE TO OUTSIDE JURISDICTION | Wednesday | TENDERLOIN | NONE | 400 Block of HYDE ST | -122.416236 | 37.784913 |
2015-05-13 18:58:00 | WARRANTS | ENROUTE TO OUTSIDE JURISDICTION | Wednesday | MISSION | NONE | 1900 Block of MISSION ST | -122.419677 | 37.765851 |
... | ... | ... | ... | ... | ... | ... | ... | ... |
2003-01-06 08:45:00 | WARRANTS | ENROUTE TO OUTSIDE JURISDICTION | Monday | SOUTHERN | ARREST, BOOKED | 0 Block of UNITED NATIONS PZ | -122.414318 | 37.779944 |
2003-01-06 07:08:00 | WARRANTS | ENROUTE TO OUTSIDE JURISDICTION | Monday | MISSION | ARREST, BOOKED | 3000 Block of 16TH ST | -122.421663 | 37.764876 |
2003-01-06 02:06:00 | WARRANTS | WARRANT ARREST | Monday | BAYVIEW | ARREST, BOOKED | NEWHALL ST / GALVEZ AV | -122.387710 | 37.740674 |
2003-01-06 02:00:00 | WARRANTS | ENROUTE TO OUTSIDE JURISDICTION | Monday | SOUTHERN | ARREST, BOOKED | 900 Block of MARKET ST | -122.409708 | 37.782828 |
2003-01-06 00:42:00 | WARRANTS | ENROUTE TO OUTSIDE JURISDICTION | Monday | TENDERLOIN | ARREST, BOOKED | TAYLOR ST / GEARY ST | -122.411519 | 37.786941 |
42145 rows × 9 columns
In [18]:
warrants['Descript'].value_counts()
Out[18]:
WARRANT ARREST 23702
ENROUTE TO OUTSIDE JURISDICTION 11456
ENROUTE TO DEPARTMENT OF CORRECTIONS 3245
ENROUTE TO PAROLE OFFICER 2149
ENROUTE TO ADULT AUTHORITY 884
PROBATION SEARCH 207
ENROUTE TO PROBATION OFFICER 207
ENROUTE TO U.S. MARSHALL 178
POST RELEASE COMMUNITY SUPERVISION 83
PAROLE SEARCH 34
Name: Descript, dtype: int64
In [19]:
fig, count = plt.subplots(figsize = (15,5))
count = sns.countplot(y = warrants['Descript'])
count.set_title('Warrant Descriptions Counts')
Out[19]:
Text(0.5, 1.0, 'Warrant Descriptions Counts')
In [20]:
# 내림차순 정렬
fig, count = plt.subplots(figsize = (15,5))
count = sns.countplot(y = warrants['Descript'],order=warrants['Descript'].value_counts().index)
count.set_title('Warrant Descriptions Counts with descending')
Out[20]:
Text(0.5, 1.0, 'Warrant Descriptions Counts with descending')
In [21]:
others = train_reduced_categories.loc[train_reduced_categories['Category'] == 'OTHER OFFENSES']
others
Out[21]:
DatesCategoryDescriptDayOfWeekPdDistrictResolutionAddressXY12111829...878029878032878039878041878043
2015-05-13 23:53:00 | OTHER OFFENSES | TRAFFIC VIOLATION ARREST | Wednesday | NORTHERN | ARREST, BOOKED | OAK ST / LAGUNA ST | -122.425892 | 37.774599 |
2015-05-13 23:33:00 | OTHER OFFENSES | TRAFFIC VIOLATION ARREST | Wednesday | NORTHERN | ARREST, BOOKED | VANNESS AV / GREENWICH ST | -122.424363 | 37.800414 |
2015-05-13 22:30:00 | OTHER OFFENSES | MISCELLANEOUS INVESTIGATION | Wednesday | TARAVAL | NONE | 0 Block of ESCOLTA WY | -122.487983 | 37.737667 |
2015-05-13 21:40:00 | OTHER OFFENSES | TRAFFIC VIOLATION | Wednesday | BAYVIEW | ARREST, BOOKED | MENDELL ST / HUDSON AV | -122.386401 | 37.738983 |
2015-05-13 20:56:00 | OTHER OFFENSES | MISCELLANEOUS INVESTIGATION | Wednesday | TARAVAL | NONE | 2000 Block of 41ST AV | -122.499787 | 37.748518 |
... | ... | ... | ... | ... | ... | ... | ... | ... |
2003-01-06 01:54:00 | OTHER OFFENSES | PROBATION VIOLATION | Monday | TENDERLOIN | ARREST, BOOKED | 1400 Block of GOLDEN GATE AV | -122.434423 | 37.779193 |
2003-01-06 01:36:00 | OTHER OFFENSES | DRIVERS LICENSE, SUSPENDED OR REVOKED | Monday | NORTHERN | ARREST, CITED | GEARY BL / FRANKLIN ST | -122.423031 | 37.785482 |
2003-01-06 00:40:00 | OTHER OFFENSES | DRIVERS LICENSE, SUSPENDED OR REVOKED | Monday | NORTHERN | ARREST, CITED | POLK ST / CALIFORNIA ST | -122.420692 | 37.790577 |
2003-01-06 00:31:00 | OTHER OFFENSES | DRIVERS LICENSE, SUSPENDED OR REVOKED | Monday | RICHMOND | ARREST, CITED | CLEMENT ST / 14TH AV | -122.472985 | 37.782552 |
2003-01-06 00:20:00 | OTHER OFFENSES | PAROLE VIOLATION | Monday | BAYVIEW | ARREST, BOOKED | 1500 Block of SHAFTER AV | -122.389769 | 37.730564 |
125960 rows × 9 columns
In [22]:
others['Descript'].unique()
Out[22]:
array(['TRAFFIC VIOLATION ARREST', 'MISCELLANEOUS INVESTIGATION',
'TRAFFIC VIOLATION', 'PROBATION VIOLATION',
'STAY AWAY ORDER VIOLATION, DV RELATED',
'DRIVERS LICENSE, SUSPENDED OR REVOKED', 'RESISTING ARREST',
'LOST/STOLEN LICENSE PLATE',
'FRAUDULENT GAME OR TRICK, OBTAINING MONEY OR PROPERTY',
'RECKLESS DRIVING', 'VIOLATION OF RESTRAINING ORDER',
'VIOLATION OF MUNICIPAL CODE', 'PAROLE VIOLATION',
'HARASSING PHONE CALLS', 'INCIDENT ON SCHOOL GROUNDS',
'VIOLATION OF PARK CODE', 'POSSESSION OF BURGLARY TOOLS W/PRIORS',
'CONSPIRACY',
'RESTRAINING ORDER NOTIFICATION/SERVICE OF RESTRAINING ORDER',
'TAMPERING WITH A VEHICLE', 'VIOLATION OF MUNICIPAL POLICE CODE',
'FALSE PERSONATION', 'POSSESSION OF BURGLARY TOOLS',
'FALSE PERSONATION TO RECEIVE MONEY OR PROPERTY',
'TRAFFIC COLLISION, HIT & RUN, INJURY', 'OBSCENE PHONE CALLS(S)',
'EVADING A POLICE OFFICER RECKLESSLY', 'DEFRAUDING TAXI DRIVER',
'PERMIT VIOLATION, POLICE (GENERAL)',
'FALSE EVIDENCE OF VEHICLE REGISTRATION', 'FALSE FIRE ALARM',
'DOG, STRAY OR VICIOUS', 'INJURY TO TELEGRAPH/TELEPHONE LINES',
'TRAFFIC COLLISION, HIT & RUN, PROPERTY DAMAGE',
'FAILURE TO REGISTER AS SEX OFFENDER',
'PROBATION VIOLATION, DV RELATED',
'VEHICLE ALARM CODE GRABBING DEVICE, POSSESS OR USE',
'OBSTRUCTIONS ON STREETS/SIDEWALKS',
'VIOLATION OF EMERGENCY PROTECTIVE ORDER', 'INDECENT EXPOSURE',
'VIOLATION OF STAY AWAY ORDER', 'MISCELLANEOUS STATE FELONY',
'MISCELLANEOUS STATE MISDEMEANOR', 'CRUELTY TO ANIMALS',
'SPEEDING', 'SCHOOL, PUBLIC, TRESPASS',
'DRIVING, RECKLESS, WITH INJURY',
'OPEN CONTAINER OF ALCOHOL IN VEHICLE',
'SCHOOL PROPERTY, DISTURBANCE ON',
'FALSE PERSONATION AND CHEAT CRIMES (GENERAL)', 'CURFEW VIOLATION',
'DESTITUTE MINOR', 'BEYOND PARENTAL CONTROL',
'FIREARMS, SEIZING AT SCENE OF DV', 'ANIMAL, FIGHTING',
'DANGER OF LEADING IMMORAL LIFE',
'DRIVES VEHICLE ALONG TRACK OF RAILROAD', 'PHONE CALLS, OBSCENE',
'VIOLATION OF FIRE CODE',
'PHONE CALLS IN VIOLATION OF DV COURT ORDER',
'VIN, ALTER OR REMOVE', 'EVADING A POLICE VEHICLE OR BICYCLE',
'CONTRIBUTING TO THE DELINQUENCY OF MINOR',
'FALSE REPORT OF CRIME',
'MONEY, PROPERTY OR LABOR, FRAUDULENTLY OBTAINING',
'THROWING INJURIOUS SUBSTANCE ON HIGHWAY',
'TOBACCO PRODUCTS, SELLING OR FURNISHING TO MINOR',
'VEHICLE, DISABLED PLACARD VIOLATION',
'PUBLIC TRANSIT CRIMES - INFRACTIONS',
'EVADING PAYMENT OF RAILROAD FARE', 'INTOXICATED JUVENILE',
'POSSESSION OF ARTICLES WITH IDENTIFICATION REMOVE',
'INDECENT EXPOSURE (JUVENILE VICTIM)',
'INTERFERRING WITH A FIREMAN', 'THROWING SUBSTANCE AT VEHICLE',
'MINOR PURCHASING OR RECEIVING TOBACCO PRODUCT',
'VIOLATION OF FEDERAL STATUTE',
'OTHER OFFENSES AGAINST PUBLIC JUSTICE',
'PEDDLING WITHOUT A LICENSE',
'FAILURE TO HEED RED LIGHT AND SIREN',
'PERMIT VIOLATION, ENTERTAINMENT',
'INDECENT EXPOSURE WITH PRIOR CONVICTION',
'INTERFERRING WITH A POLICE OFFICER',
'COMPUTER SYSTEM, ACCESSING, COPYING, OR DAMAGING',
'PLACING TRASH ON THE STREET', 'SPITTING ON SIDEWALK',
'ACTS AGAINST PUBLIC TRANSIT',
'HAZARDOUS MATERIALS, SPILL ON ROADWAY',
'OPERATING TAXI WITHOUT A PERMIT', 'FALSE REPORT OF BOMB',
'THROWING OBJECT AT COMMON CARRIER, PASSENGER OR FREIGHT',
'DUMPING OF OFFENSIVE MATTER', 'PERMIT VIOLATION, VALET PARKING',
'JUDGE/JUROR ACCEPTING A BRIBE',
'WEARING MASK OR DISGUISE FOR UNLAWFUL PURPOSE',
'POSSESSION OF FIRECRACKERS', 'AGGRESSIVE SOLICITING',
'HABITUAL TRUANT', 'ESCAPE OR ASSISTING ESCAPE',
'TAKING CONTRABAND INTO A REFORMATORY',
'DISRUPTS SCHOOL ACTIVITIES', 'PHONE CALLS, HARASSING, TO 911',
'RIOT', 'DOG, BARKING', 'SOLICITING COMMISSION OF A CRIME',
'DOG OR CAT, ABANDONMENT OF',
'EVADING A POLICE OFFICER, INJURY OR DEATH',
'FIREWORKS, THROW AT PERSON OR DISCHARGE IN CROWD',
'LYNCHING BY RIOT', 'PEEPING TOM', 'ABORTION',
'AID OR HARBOR FELON', 'ANIMAL, WITHOUT PROPER CARE OR ATTENTION',
'LICENSE PLATE, STOLEN', 'SCALPING TICKETS',
'FOOD STAMPS, MISUSE OF',
'INSURED PROPERTY, DESTRUCTION TO DEFRAUD INSURER',
'MASSAGE ESTABLISHMENT PERMIT VIOLATION',
'DISCHARGING OFFENSIVE OR INJURIOUS SUBSTANCE IN PUBLIC AREA',
'VIOLATION OF CIVIL GANG INJUNCTION',
'PERMIT VIOLATION, SIDEWALK SALES',
'SELLING/DISCHARGING OF FIRECRACKERS',
'ADVERTISING DISTRIBUTORS PERMIT VIOLATION',
'HAZARDOUS MATERIALS, DUMPING IN UNAUTHORIZED LOCATN',
'RESCUING PRISONER FROM LAWFUL CUSTODY',
'VIOLATION OF CALIF UNEMPLOYMENT INSURANCE ACT',
'HAZARDOUS SUBSTANCES, DEPOSITING',
'SCHOOL GROUNDS, ENTRY BY SEX OFFENDER', 'ESCAPEE, JUVENILE',
'TAMPERING WITH MAIL', 'SELLING RESTRICTED GLUE TO JUVENILES',
'FORTUNE TELLING',
'SCHOOL STUDENT OR EMPLOYEE ENTERING CAMPUS AFTER SUSPENSION OR DISMISSAL',
'HEATING VIOLATION APT/HOTEL', 'VIN SWITCH',
'HAZARDOUS MATERIALS, DUMP OIL INTO SEWERS',
'BATHROOM HOLE, LOOKING THROUGH',
'OPERATING WITHOUT DANCEHALL PERMIT',
'GUIDE DOG, INTERFERING WITH', 'ESCAPES', 'DRIVING, DRAG RACING',
'PERJURY', 'LOUDSPEAKER OR SOUND TRUCK PERMIT VIOLATION',
'DRUG OFFENDER, PRESENCE NEAR SCHOOL GROUNDS',
'ESCAPE FROM HOSPITAL WITH FORCE', 'JUVENILE PAROLE VIOLATOR',
'HAZARDOUS MATERIALS, DUMPING IN UNAUTHORIZED LOCATION',
'POLICE BROADCAST, INTERCEPTION TO COMMIT CRIME',
'DISPLAY & SALE OF SPRAY PAINT & MARKER PENS',
'HAZARDOUS MATERIALS, DUMP ANY SUBSTANCE INTO WATER',
'DOG, FIGHTING; OWNING, FIGHTING, OR ATTENDING FIGHT',
'SOLICITING MINOR TO COMMIT FELONY', 'POISONING ANIMALS',
'WEARING THE APPAREL OF OPPOSITE SEX TO DECEIVE',
'HAZARDOUS MATERIALS, SPILL LOAD',
'HAZARDOUS MATERIALS, FAILURE TO COMPLY W/REGULATIONS',
'VIOLATION OF STATE LABOR CODE', 'ESCAPE FROM JAIL',
'PUSH-CART PEDDLER PERMIT VIOLATION',
'SALE OF SATELLITE TELEPHONE NUMBER',
'ESCAPE OF PRISONER WHILE HOSPITALIZED',
'INJURY TO RAILROADS/RAILROAD BRIDGES',
'OPERATING WITHOUT CABARET PERMIT',
'AFFIXING ADVERTISMENTS TO POLES',
'ILLEGAL CHARITABLE SOLICITATIONS', 'OVERCHARGING TAXI FARE',
'UNKNOWN COMPLAINT', 'UNAUTHORIZED USE OF LOUD SPEAKERS',
'REFUSAL TO IDENTIFY'], dtype=object)
In [23]:
larceny_theft = train_reduced_categories.loc[train_reduced_categories['Category'] == 'LARCENY/THEFT']
larceny_theft
Out[23]:
DatesCategoryDescriptDayOfWeekPdDistrictResolutionAddressXY34589...878002878023878036878045878046
2015-05-13 23:30:00 | LARCENY/THEFT | GRAND THEFT FROM LOCKED AUTO | Wednesday | NORTHERN | NONE | 1500 Block of LOMBARD ST | -122.426995 | 37.800873 |
2015-05-13 23:30:00 | LARCENY/THEFT | GRAND THEFT FROM LOCKED AUTO | Wednesday | PARK | NONE | 100 Block of BRODERICK ST | -122.438738 | 37.771541 |
2015-05-13 23:30:00 | LARCENY/THEFT | GRAND THEFT FROM UNLOCKED AUTO | Wednesday | INGLESIDE | NONE | 0 Block of TEDDY AV | -122.403252 | 37.713431 |
2015-05-13 23:00:00 | LARCENY/THEFT | GRAND THEFT FROM LOCKED AUTO | Wednesday | RICHMOND | NONE | 600 Block of 47TH AV | -122.508194 | 37.776601 |
2015-05-13 23:00:00 | LARCENY/THEFT | GRAND THEFT FROM LOCKED AUTO | Wednesday | CENTRAL | NONE | JEFFERSON ST / LEAVENWORTH ST | -122.419088 | 37.807802 |
... | ... | ... | ... | ... | ... | ... | ... | ... |
2003-01-06 06:00:00 | LARCENY/THEFT | PETTY THEFT FROM LOCKED AUTO | Monday | SOUTHERN | NONE | 5TH ST / HARRISON ST | -122.401846 | 37.779032 |
2003-01-06 02:15:00 | LARCENY/THEFT | GRAND THEFT PICKPOCKET | Monday | TENDERLOIN | NONE | 600 Block of ELLIS ST | -122.416894 | 37.784286 |
2003-01-06 00:55:00 | LARCENY/THEFT | PETTY THEFT SHOPLIFTING | Monday | NORTHERN | NONE | 1300 Block of WEBSTER ST | -122.431046 | 37.783030 |
2003-01-06 00:01:00 | LARCENY/THEFT | GRAND THEFT FROM LOCKED AUTO | Monday | INGLESIDE | NONE | 600 Block of EDNA ST | -122.447364 | 37.731948 |
2003-01-06 00:01:00 | LARCENY/THEFT | GRAND THEFT FROM LOCKED AUTO | Monday | SOUTHERN | NONE | 5TH ST / FOLSOM ST | -122.403390 | 37.780266 |
174320 rows × 9 columns
In [24]:
larceny_theft['Descript'].unique()
Out[24]:
array(['GRAND THEFT FROM LOCKED AUTO', 'GRAND THEFT FROM UNLOCKED AUTO',
'PETTY THEFT FROM LOCKED AUTO',
'ATTEMPTED THEFT FROM LOCKED VEHICLE', 'PETTY THEFT SHOPLIFTING',
'GRAND THEFT FROM PERSON', 'PETTY THEFT FROM A BUILDING',
'GRAND THEFT OF PROPERTY', 'GRAND THEFT FROM A BUILDING',
'PETTY THEFT OF PROPERTY', 'GRAND THEFT SHOPLIFTING',
'THEFT OF ANIMALS (GENERAL)', 'PETTY THEFT BICYCLE',
'GRAND THEFT BICYCLE', 'ATTEMPTED THEFT OF A BICYCLE',
'ATTEMPTED GRAND THEFT PURSESNATCH',
'THEFT OF COMPUTERS OR CELL PHONES', 'GRAND THEFT PICKPOCKET',
'PETTY THEFT FROM UNLOCKED AUTO', 'THEFT FROM MERCHANT OR LIBRARY',
'ATTEMPTED THEFT FROM A BUILDING',
'THEFT OF CHECKS OR CREDIT CARDS', 'ATTEMPTED SHOPLIFTING',
'LOST PROPERTY, PETTY THEFT',
'ATTEMPTED THEFT FROM UNLOCKED VEHICLE',
'THEFT, GRAND, OF FIREARM', 'GRAND THEFT AUTO STRIP',
'ATTEMPTED GRAND THEFT FROM PERSON', 'THEFT, DRUNK ROLL, <$50',
'PETTY THEFT AUTO STRIP', 'ATTEMPTED PETTY THEFT OF PROPERTY',
'GRAND THEFT PURSESNATCH',
'EMBEZZLEMENT FROM DEPENDENT OR ELDER ADULT BY CARETAKER',
'THEFT, DRUNK ROLL, $50-$200', 'LOST PROPERTY, GRAND THEFT',
'LICENSE PLATE OR TAB, THEFT OF', 'THEFT OF UTILITY SERVICES',
'THEFT, BICYCLE, <$50, SERIAL NUMBER KNOWN',
'THEFT, GRAND, BY FIDUCIARY, >$400 IN 12 MONTHS',
'TRADE SECRETS, THEFT OR UNAUTHORIZED COPYING',
'PETTY THEFT COIN OPERATED MACHINE', 'GRAND THEFT BY PROSTITUTE',
'PETTY THEFT WITH PRIOR', 'THEFT OF WRITTEN INSTRUMENT',
'PETTY THEFT MOTORCYCLE STRIP', 'THEFT, BOAT',
'THEFT, DRUNK ROLL, $200-$400',
'THEFT, BICYCLE, <$50, NO SERIAL NUMBER',
'GRAND THEFT MOTORCYCLE STRIP', 'THEFT, DRUNK ROLL, >$400',
'THEFT, GRAND, AGRICULTURAL', 'ATTEMPTED GRAND THEFT PICKPOCKET',
'THEFT, ANIMAL, ATT.',
'THEFT OF TELECOMMUNICATION SERVICES, INCL. CLONE PHONE',
'GRAND THEFT COIN OPERATED MACHINE',
'ATTEMPTED THEFT COIN OPERATED MACHINE', 'ATTEMPTED AUTO STRIP',
'PETTY THEFT PHONE BOOTH', 'THEFT, DRUNK ROLL, ATT.',
'ATTEMPTED MOTORCYCLE STRIP', 'GRAND THEFT PHONE BOOTH',
'ATTEMPTED THEFT PHONE BOOTH'], dtype=object)
In [25]:
vehicle_theft = train_reduced_categories.loc[train_reduced_categories['Category'] == 'VEHICLE THEFT']
vehicle_theft
Out[25]:
DatesCategoryDescriptDayOfWeekPdDistrictResolutionAddressXY67465960...877980877994877999878000878022
2015-05-13 23:30:00 | VEHICLE THEFT | STOLEN AUTOMOBILE | Wednesday | INGLESIDE | NONE | AVALON AV / PERU AV | -122.423327 | 37.725138 |
2015-05-13 23:30:00 | VEHICLE THEFT | STOLEN AUTOMOBILE | Wednesday | BAYVIEW | NONE | KIRKWOOD AV / DONAHUE ST | -122.371274 | 37.727564 |
2015-05-13 20:00:00 | VEHICLE THEFT | STOLEN MOTORCYCLE | Wednesday | INGLESIDE | NONE | 0 Block of CRESCENT AV | -122.423702 | 37.735233 |
2015-05-13 19:28:00 | VEHICLE THEFT | STOLEN AND RECOVERED VEHICLE | Wednesday | CENTRAL | NONE | 0 Block of SANSOME ST | -122.400720 | 37.790712 |
2015-05-13 19:28:00 | VEHICLE THEFT | STOLEN AUTOMOBILE | Wednesday | CENTRAL | NONE | 0 Block of SANSOME ST | -122.400720 | 37.790712 |
... | ... | ... | ... | ... | ... | ... | ... | ... |
2003-01-06 08:13:00 | VEHICLE THEFT | STOLEN MOTORCYCLE | Monday | TARAVAL | ARREST, CITED | JUNIPERO SERRA BL / HOLLOWAY AV | -122.472322 | 37.721622 |
2003-01-06 07:20:00 | VEHICLE THEFT | VEHICLE, RECOVERED, AUTO | Monday | INGLESIDE | NONE | 400 Block of BENTON AV | -122.418502 | 37.733945 |
2003-01-06 06:30:00 | VEHICLE THEFT | STOLEN TRUCK | Monday | BAYVIEW | NONE | 1400 Block of EGBERT AV | -122.393819 | 37.723277 |
2003-01-06 06:30:00 | VEHICLE THEFT | VEHICLE, RECOVERED, OTHER VEHICLE | Monday | BAYVIEW | NONE | 1400 Block of EGBERT AV | -122.393819 | 37.723277 |
2003-01-06 02:16:00 | VEHICLE THEFT | RECOVERED VEHICLE - STOLEN OUTSIDE SF | Monday | MISSION | NONE | 17TH ST / MISSION ST | -122.419516 | 37.763429 |
53706 rows × 9 columns
In [26]:
vehicle_theft['Descript'].value_counts()
Out[26]:
STOLEN AUTOMOBILE 26866
STOLEN TRUCK 8578
VEHICLE, RECOVERED, AUTO 8017
VEHICLE, RECOVERED, OTHER VEHICLE 2576
STOLEN MOTORCYCLE 2320
STOLEN AND RECOVERED VEHICLE 2257
RECOVERED VEHICLE - STOLEN OUTSIDE SF 1613
ATTEMPTED STOLEN VEHICLE 503
VEHICLE, RECOVERED, MOTORCYCLE 411
TAMPERING WITH A VEHICLE 232
STOLEN MISCELLANEOUS VEHICLE 162
STOLEN TRAILER 52
AUTO, GRAND THEFT OF 48
VEHICLE, RECOVERED, CAMPER-HOUSE CAR-MOTOR HOME 45
VEHICLE, RECOVERED, MOBILE HOME-TRAILER 9
STOLEN BUS 8
VEHICLE, RENTAL, FAILURE TO RETURN 7
VEHICLE, RECOVERED, BUS 2
Name: Descript, dtype: int64
In [27]:
fig, count = plt.subplots(figsize = (15,10))
count = sns.countplot(y = vehicle_theft['Descript'])
count.set_title('Vehicle Theft Descriptions Count')
Out[27]:
Text(0.5, 1.0, 'Vehicle Theft Descriptions Count')
In [28]:
fig, count = plt.subplots(figsize = (15,10))
count = sns.countplot(y = vehicle_theft['Descript'],order=vehicle_theft['Descript'].value_counts().index)
count.set_title('Vehicle Theft Descriptions Count with Descending')
Out[28]:
Text(0.5, 1.0, 'Vehicle Theft Descriptions Count with Descending')
In [29]:
vandalism = train_reduced_categories.loc[train_reduced_categories['Category'] == 'VANDALISM']
vandalism
Out[29]:
DatesCategoryDescriptDayOfWeekPdDistrictResolutionAddressXY12323439126...878016878021878033878037878047
2015-05-13 22:30:00 | VANDALISM | MALICIOUS MISCHIEF, VANDALISM OF VEHICLES | Wednesday | TENDERLOIN | NONE | TURK ST / JONES ST | -122.412414 | 37.783004 |
2015-05-13 20:45:00 | VANDALISM | MALICIOUS MISCHIEF, VANDALISM | Wednesday | NORTHERN | NONE | 1500 Block of FILLMORE ST | -122.432744 | 37.783842 |
2015-05-13 20:30:00 | VANDALISM | MALICIOUS MISCHIEF, VANDALISM | Wednesday | NORTHERN | NONE | 1600 Block of WEBSTER ST | -122.431310 | 37.785871 |
2015-05-13 20:25:00 | VANDALISM | MALICIOUS MISCHIEF, VANDALISM | Wednesday | INGLESIDE | NONE | 0 Block of WINDING WY | -122.432446 | 37.710833 |
2015-05-13 17:07:00 | VANDALISM | MALICIOUS MISCHIEF, VANDALISM | Wednesday | NORTHERN | NONE | 1100 Block of FILLMORE ST | -122.431980 | 37.780048 |
... | ... | ... | ... | ... | ... | ... | ... | ... |
2003-01-06 03:15:00 | VANDALISM | MALICIOUS MISCHIEF, VANDALISM OF VEHICLES | Monday | BAYVIEW | NONE | 23RD ST / WISCONSIN ST | -122.398696 | 37.754746 |
2003-01-06 02:24:00 | VANDALISM | MALICIOUS MISCHIEF | Monday | NORTHERN | NOT PROSECUTED | SANCHEZ ST / 14TH ST | -122.431191 | 37.767595 |
2003-01-06 01:30:00 | VANDALISM | MALICIOUS MISCHIEF, VANDALISM | Monday | RICHMOND | ARREST, CITED | 1000 Block of 22ND AV | -122.391668 | 37.757793 |
2003-01-06 00:55:00 | VANDALISM | MALICIOUS MISCHIEF, VANDALISM | Monday | NORTHERN | NONE | 1300 Block of WEBSTER ST | -122.431046 | 37.783030 |
2003-01-06 00:01:00 | VANDALISM | MALICIOUS MISCHIEF, VANDALISM OF VEHICLES | Monday | SOUTHERN | NONE | TOWNSEND ST / 2ND ST | -122.390531 | 37.780607 |
44581 rows × 9 columns
In [30]:
vandalism['Descript'].value_counts()
Out[30]:
MALICIOUS MISCHIEF, VANDALISM OF VEHICLES 17717
MALICIOUS MISCHIEF, VANDALISM 15918
MALICIOUS MISCHIEF, BREAKING WINDOWS 4970
MALICIOUS MISCHIEF, GRAFFITI 3749
MALICIOUS MISCHIEF 475
MALICIOUS MISCHIEF, TIRE SLASHING 300
MALICIOUS MISCHIEF, STREET CARS/BUSES 273
VANDALISM OR GRAFFITI TOOLS, POSSESSION 268
MALICIOUS MISCHIEF, BREAKING WINDOWS WITH BB GUN 260
MALICIOUS MISCHIEF, ADULT SUSPECT 231
MALICIOUS MISCHIEF, JUVENILE SUSPECT 86
GRAFFITI ON GOVERNMENT VEHICLES OR PUBLIC TRANSPORTATION 58
MALICIOUS MISCHIEF, BUILDING UNDER CONSTRUCTION 53
VANDALISM OR GRAFFITI ON OR WITHIN 100 FT OF HIGHWAY 52
MALICIOUS MISCHIEF, FICTITIOUS PHONE CALLS 52
DAMAGE TO FIRE ALARM APPARATUS 32
AEROSOL CONTAINER; SALE, PURCHASE OR POSSESSION OF 24
DAMAGE/DESTRUCTION OF MAIL 17
ELECTRICAL OR GAS LINES, INTERFERING WITH 15
DAMAGE TO MAIL BOX 13
DAMAGE TO PARKING METERS 9
VANDALISM WITH NOXIOUS CHEMICAL 4
BALLOONS, ELECTRICALLY CONDUCTIVE 2
DESTROYING JAIL PROPERTY-OVER $200 2
DESTROYING JAIL PROPERTY-$200 OR UNDER 1
Name: Descript, dtype: int64
In [31]:
fig, count = plt.subplots(figsize = (15,10))
count = sns.countplot(y = vandalism['Descript'])
count.set_title('Vandalism Descriptions Count')
Out[31]:
Text(0.5, 1.0, 'Vandalism Descriptions Count')
In [32]:
fig, count = plt.subplots(figsize = (15,10))
count = sns.countplot(y = vandalism['Descript'], order = vandalism['Descript'].value_counts().index)
count.set_title('Vandalism Descriptions Count with descending')
Out[32]:
Text(0.5, 1.0, 'Vandalism Descriptions Count with descending')
In [33]:
non_criminal = train_reduced_categories.loc[train_reduced_categories['Category'] == 'NON-CRIMINAL']
non_criminal
Out[33]:
DatesCategoryDescriptDayOfWeekPdDistrictResolutionAddressXY1415192325...877927877939877955877976877990
2015-05-13 22:00:00 | NON-CRIMINAL | FOUND PROPERTY | Wednesday | BAYVIEW | NONE | 200 Block of WILLIAMS AV | -122.397744 | 37.729935 |
2015-05-13 22:00:00 | NON-CRIMINAL | FOUND PROPERTY | Wednesday | BAYVIEW | NONE | 0 Block of MENDELL ST | -122.383692 | 37.743189 |
2015-05-13 21:30:00 | NON-CRIMINAL | FOUND PROPERTY | Wednesday | TENDERLOIN | NONE | 100 Block of JONES ST | -122.412250 | 37.782556 |
2015-05-13 21:11:00 | NON-CRIMINAL | STAY AWAY OR COURT ORDER, NON-DV RELATED | Wednesday | TENDERLOIN | NONE | 100 Block of JONES ST | -122.412250 | 37.782556 |
2015-05-13 21:00:00 | NON-CRIMINAL | LOST PROPERTY | Wednesday | TENDERLOIN | NONE | 300 Block of OFARRELL ST | -122.410509 | 37.786043 |
... | ... | ... | ... | ... | ... | ... | ... | ... |
2003-01-06 10:40:00 | NON-CRIMINAL | FOUND PROPERTY | Monday | TARAVAL | NONE | 45TH AV / PACHECO ST | -122.504071 | 37.749281 |
2003-01-06 10:15:00 | NON-CRIMINAL | FOUND PROPERTY | Monday | INGLESIDE | NONE | 100 Block of PEABODY ST | -122.407445 | 37.710393 |
2003-01-06 09:27:00 | NON-CRIMINAL | AIDED CASE, MENTAL DISTURBED | Monday | SOUTHERN | NONE | 100 Block of BLUXOME ST | -122.399064 | 37.775012 |
2003-01-06 08:29:00 | NON-CRIMINAL | AIDED CASE, MENTAL DISTURBED | Monday | CENTRAL | PSYCHOPATHIC CASE | 400 Block of BROADWAY ST | -122.405065 | 37.798013 |
2003-01-06 07:40:00 | NON-CRIMINAL | AIDED CASE, MENTAL DISTURBED | Monday | NORTHERN | NONE | 2100 Block of GREENWICH ST | -122.435072 | 37.799109 |
91915 rows × 9 columns
In [34]:
non_criminal['Descript'].value_counts()
Out[34]:
LOST PROPERTY 31498
AIDED CASE, MENTAL DISTURBED 21488
FOUND PROPERTY 12078
AIDED CASE 5417
DEATH REPORT, CAUSE UNKNOWN 4206
CASE CLOSURE 2257
STAY AWAY OR COURT ORDER, NON-DV RELATED 1653
AIDED CASE, DOG BITE 1336
CIVIL SIDEWALKS, CITATION 1198
PROPERTY FOR IDENTIFICATION 1196
AIDED CASE, INJURED PERSON 1060
DEATH REPORT, NATURAL CAUSES 1011
CIVIL SIDEWALKS, WARNING 902
COURTESY REPORT 870
FIRE REPORT 794
AIDED CASE -PROPERTY FOR DESTRUCTION 744
LOCATED PROPERTY 694
TARASOFF REPORT 671
SEARCH WARRANT SERVICE 539
TURNED IN GUN 494
TRAFFIC ACCIDENT 431
SHELTER 384
IMPOUNDED VEHICLE 316
AIDED CASE, SICK PERSON 254
LICENSE PLATE, FOUND 97
MISPLACED VEHICLE 61
LICENSE PLATE, RECOVERED 57
CIVIL SIDEWALKS, BOOKING 41
CIVIL SIDEWALKS, VIOLATION 37
TRUANT, HABITUAL 35
DEATH, ACCIDENTAL 19
DEATH, NON-MANSLAUGHTER AUTO ACCIDENT 15
YOUTH COURT 15
ACCIDENTAL SHOOTING 13
DEMONSTRATION, VIDEO EVIDENCE, MISC. INVESTIGATION 11
MEGAN'S LAW NOTIFICATION 10
AUTO IMPOUNDED 6
DEATH REPORT, IN CUSTODY 5
ACCIDENTAL BURNS 2
Name: Descript, dtype: int64
In [35]:
fig, count = plt.subplots(figsize = (17,15))
count = sns.countplot(y = non_criminal['Descript'])
count.set_title('Non-Criminal Events Descriptions Count')
Out[35]:
Text(0.5, 1.0, 'Non-Criminal Events Descriptions Count')
In [36]:
fig, count = plt.subplots(figsize = (17,15))
count = sns.countplot(y = non_criminal['Descript'],order=non_criminal['Descript'].value_counts().index)
count.set_title('Non-Criminal Events Descriptions Count')
Out[36]:
Text(0.5, 1.0, 'Non-Criminal Events Descriptions Count')
In [37]:
assault = train_reduced_categories.loc[train_reduced_categories['Category'] == 'ASSAULT']
assault
Out[37]:
DatesCategoryDescriptDayOfWeekPdDistrictResolutionAddressXY1751109114165...878028878031878035878040878042
2015-05-13 21:55:00 | ASSAULT | AGGRAVATED ASSAULT WITH BODILY FORCE | Wednesday | INGLESIDE | NONE | GODEUS ST / MISSION ST | -122.421682 | 37.742822 |
2015-05-13 19:33:00 | ASSAULT | AGGRAVATED ASSAULT WITH BODILY FORCE | Wednesday | BAYVIEW | NONE | 23RD ST / WISCONSIN ST | -122.398696 | 37.754746 |
2015-05-13 17:47:00 | ASSAULT | CHILD ABUSE (PHYSICAL) | Wednesday | BAYVIEW | NONE | 0 Block of WHITFIELD CT | -122.381838 | 37.731104 |
2015-05-13 17:40:00 | ASSAULT | THREATS AGAINST LIFE | Wednesday | CENTRAL | NONE | 1400 Block of STOCKTON ST | -122.409032 | 37.799253 |
2015-05-13 15:40:00 | ASSAULT | BATTERY, FORMER SPOUSE OR DATING RELATIONSHIP | Wednesday | PARK | NONE | 1700 Block of MCALLISTER ST | -122.440880 | 37.777532 |
... | ... | ... | ... | ... | ... | ... | ... | ... |
2003-01-06 02:00:00 | ASSAULT | AGGRAVATED ASSAULT WITH BODILY FORCE | Monday | SOUTHERN | NONE | 6TH ST / MARKET ST | -122.410294 | 37.782231 |
2003-01-06 01:50:00 | ASSAULT | BATTERY | Monday | BAYVIEW | NONE | 3RD ST / NEWCOMB AV | -122.390417 | 37.735593 |
2003-01-06 00:55:00 | ASSAULT | BATTERY | Monday | NORTHERN | NONE | 1300 Block of WEBSTER ST | -122.431046 | 37.783030 |
2003-01-06 00:33:00 | ASSAULT | INFLICT INJURY ON COHABITEE | Monday | MISSION | NONE | 2800 Block of FOLSOM ST | -122.414073 | 37.751685 |
2003-01-06 00:20:00 | ASSAULT | ATTEMPTED HOMICIDE WITH A GUN | Monday | BAYVIEW | ARREST, BOOKED | 1500 Block of SHAFTER AV | -122.389769 | 37.730564 |
76815 rows × 9 columns
In [38]:
assault['Descript'].unique()
Out[38]:
array(['AGGRAVATED ASSAULT WITH BODILY FORCE', 'CHILD ABUSE (PHYSICAL)',
'THREATS AGAINST LIFE',
'BATTERY, FORMER SPOUSE OR DATING RELATIONSHIP',
'SHOOTING INTO INHABITED DWELLING OR OCCUPIED VEHICLE', 'BATTERY',
'AGGRAVATED ASSAULT WITH A DEADLY WEAPON',
'AGGRAVATED ASSAULT WITH A KNIFE', 'INFLICT INJURY ON COHABITEE',
'ASSAULT WITH CAUSTIC CHEMICALS', 'AGGRAVATED ASSAULT WITH A GUN',
'ASSAULT, AGGRAVATED, W/ GUN',
'ELDER ADULT OR DEPENDENT ABUSE (NOT EMBEZZLEMENT OR THEFT)',
'TRESPASS WITHIN 30 DAYS OF CREDIBLE THREAT',
'BATTERY OF A POLICE OFFICER', 'ATTEMPTED HOMICIDE WITH A GUN',
'FALSE IMPRISONMENT', 'ASSAULT',
'ATTEMPTED HOMICIDE WITH BODILY FORCE',
'THREATENING PHONE CALL(S)',
'FIREARM, DISCHARGING AT OCCUPIED BLDG, VEHICLE, OR AIRCRAFT',
'ATTEMPTED HOMICIDE WITH A KNIFE',
'THREAT OR FORCE TO RESIST EXECUTIVE OFFICER',
'THREATS TO SCHOOL TEACHERS',
'ATTEMPTED MAYHEM WITH A DEADLY WEAPON', 'STALKING',
'BATTERY WITH SERIOUS INJURIES',
'AGGRAVATED ASSAULT OF POLICE OFFICER,BODILY FORCE',
'THREATENING SCHOOL OR PUBLIC EMPLOYEE',
'ASSAULT ON A POLICE OFFICER WITH A DEADLY WEAPON',
'ATTEMPTED SIMPLE ASSAULT', 'MAYHEM WITH A KNIFE',
'UNLAWFUL DISSUADING/THREATENING OF A WITNESS',
'RESISTING PEACE OFFICER, CAUSING THEIR SERIOUS INJURY OR DEATH',
'DISCHARGING IN GROSSLY NEGLIGENT MANNER',
'CHILD, INFLICTING INJURY RESULTING IN TRAUMATIC CONDITION',
'ATTEMPTED MAYHEM WITH BODILY FORCE',
'MAYHEM WITH A DEADLY WEAPON',
'ATTEMPTED HOMICIDE WITH A DANGEROUS WEAPON',
'FIREARM, DISCHARGING IN GROSSLY NEGLIGENT MANNER',
'WILLFUL CRUELTY TO CHILD',
'AGGRAVATED ASSAULT ON POLICE OFFICER WITH A KNIFE',
'THREAT TO STATE OFFICIAL OR JUDGE', 'MAYHEM WITH BODILY FORCE',
'MAYHEM WITH A GUN',
'CIVIL RIGHTS, INCL. INJURY, THREAT, OR DAMAGE (HATE CRIMES)',
'ASSAULT, AGGRAVATED, ON POLICE OFFICER, W/ GUN',
'ASSAULT BY POLICE OFFICER',
'LASERS, DISCHARGING OR LIGHTS AT AIRCRAFT',
'ATTEMPTED MAYHEM WITH A KNIFE',
"ASSAULT OR ATTEMPTED MURDER UPON GOV'T OFFICERS",
'BATTERY DURING LABOR DISPUTE',
'ASSAULT, AGGRAVATED, ON POLICE OFFICER, W/ SEMI AUTO',
'ASSAULT, AGGRAVATED, W/ MACHINE GUN',
'ASSAULT, AGGRAVATED, W/ SEMI AUTO', 'ASSAULT BY POISONING',
'ASSAULT, AGGRAVATED, ON POLICE OFFICER, W/ FULL AUTO',
'AGGRAVATED ASSAULT ON POLICE OFFICER WITH A GUN',
'TERRORIZING BY ARSON OR EXPLOSIVE DEVICE',
'TERRORIZING BY MARKING PRIVATE PROPERTY',
'ATTEMPTED MAYHEM WITH A GUN',
'AGGRAVATED ASSAULT OF POLICE OFFICER, SNIPING',
'ATTEMPTED HOMICIDE WITH EXPLOSIVES'], dtype=object)
In [39]:
train_reduced_categories.loc[train_reduced_categories['Category'].isin(['LARCENY/THEFT', 'VEHICLE THEFT']), 'Category'] = 'THEFT'
train_reduced_categories['Category'].unique()
/opt/conda/lib/python3.7/site-packages/pandas/core/indexing.py:1817: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
self._setitem_single_column(loc, value, pi)
Out[39]:
array(['WARRANTS', 'OTHER OFFENSES', 'THEFT', 'VANDALISM', 'NON-CRIMINAL',
'ASSAULT', 'BURGLARY', 'DRUG/NARCOTIC'], dtype=object)
In [40]:
fig, count = plt.subplots(figsize = (15,5))
count = sns.countplot(y = train_reduced_categories['Category'])
count.set_title('Most common types of crimes')
Out[40]:
Text(0.5, 1.0, 'Most common types of crimes')
In [41]:
fig, count = plt.subplots(figsize = (15,5))
count = sns.countplot(y = train_reduced_categories['Category'],order=train_reduced_categories['Category'].value_counts().index)
count.set_title('Most common types of crimes')
Out[41]:
Text(0.5, 1.0, 'Most common types of crimes')
["PdDistrict"] Column
In [42]:
train_clean = train_reduced_categories
구역 별 범죄량
In [43]:
import matplotlib.pyplot as plt
import seaborn as sns
fig, count = plt.subplots(figsize = (12,10))
count = sns.countplot(x = train_clean['PdDistrict'])
count.set_title('Districts Count')
Out[43]:
Text(0.5, 1.0, 'Districts Count')
In [44]:
import matplotlib.pyplot as plt
import seaborn as sns
fig, count = plt.subplots(figsize = (12,10))
count = sns.countplot(x = train_clean['PdDistrict'], order=train_clean['PdDistrict'].value_counts().index)
count.set_title('Districts Count')
Out[44]:
Text(0.5, 1.0, 'Districts Count')
지역 + 카테고리 크로스 데이터 정리
In [45]:
ct_district_cat = pd.crosstab(train_clean['Category'], train_clean['PdDistrict'])
ct_district_cat
# npp = np.array(ct_district_cat)
# sum = 0
# for i in range(0, npp.shape[0]):
# for j in range(0, npp.shape[1]):
# sum+= npp[i][j]
# sum
Out[45]:
PdDistrictBAYVIEWCENTRALINGLESIDEMISSIONNORTHERNPARKRICHMONDSOUTHERNTARAVALTENDERLOINCategoryASSAULTBURGLARYDRUG/NARCOTICNON-CRIMINALOTHER OFFENSESTHEFTVANDALISMWARRANTS
9845 | 6971 | 8522 | 11146 | 8312 | 3512 | 3198 | 12175 | 5460 | 7674 |
3914 | 4499 | 3327 | 3736 | 5827 | 2875 | 2683 | 4796 | 3459 | 1484 |
4496 | 1805 | 2372 | 8750 | 4508 | 2570 | 999 | 9222 | 1529 | 17668 |
6083 | 10923 | 6846 | 12344 | 10206 | 5886 | 5733 | 19548 | 6910 | 7436 |
17030 | 8893 | 13176 | 19308 | 12226 | 6180 | 5619 | 21245 | 8599 | 13684 |
17274 | 29181 | 19157 | 25335 | 34852 | 13073 | 13959 | 46382 | 17940 | 10873 |
5344 | 4451 | 5363 | 5273 | 5397 | 2603 | 3155 | 6533 | 4850 | 1612 |
4319 | 2776 | 2522 | 6605 | 4592 | 2308 | 1008 | 9083 | 1613 | 7319 |
stack()
In [46]:
stacked= ct_district_cat.stack().reset_index().rename(columns= {0:'value'})
fig, bar= plt.subplots(figsize= (15,10))
bar= sns.barplot(x= stacked['PdDistrict'], y= stacked['value'], hue= stacked['Category'])
bar.set_title('Categories Count per District')
Out[46]:
Text(0.5, 1.0, 'Categories Count per District')
In [47]:
fig, heatmap = plt.subplots(figsize = (15,10))
heatmap = sns.heatmap(ct_district_cat, annot= True, cmap= 'Reds')
heatmap.set_title('Categories X Districts')
Out[47]:
Text(0.5, 1.0, 'Categories X Districts')
In [48]:
with plt.style.context('fivethirtyeight'):
fig, ax = plt.subplots(1, 1, figsize=(19, 19))
# 위도 경도 데이터를 바탕으로
sns.scatterplot(data=train.iloc[:250000], x='X', y='Y', alpha=0.6, palette='rocket', hue='Category', size='Category') # 878049
# legend() => 범례 표시하기
plt.legend(bbox_to_anchor=(1.0, 1.0), loc='upper left')
In [ ]:
'AI & Data Analysis > Kaggle Notebook' 카테고리의 다른 글
[NLP] SMS Spam Collection Dataset (0) | 2022.06.07 |
---|---|
[CNN] Traffic Signs Classification with Explanation (0) | 2022.05.27 |
[EDA & Visualization] Netflix Dataset (0) | 2022.05.27 |
[EDA] Home-credit-default-risk Dataset (0) | 2022.05.27 |
[CNN] Fashion-Mnist-Data (0) | 2022.03.31 |