The objective of this notebook is to take the first 5 digits of a CEP (Brazilian postal code) and find the corresponding geographical coordinates.
To make the best use of this notebook, I suggest running it on your own machine as you follow along.
# Show the current working directory (presumably via IPython's pwd magic);
# the relative CSV path read further down is resolved against it.
pwd()
# NOTE(review): IPython runs each `!` command in its own short-lived subshell,
# so this activation likely does not carry over to the notebook kernel —
# confirm the kernel itself is started from the susep_env environment.
!source susep_env/bin/activate
Some of the packages below are complicated to install.
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
import re
We use the table
arq_casco3_comp from the site:
http://www2.susep.gov.br/menuestatistica/Autoseg/principal.aspx
Table arq_casco3_comp
has the key: Categoria Tarifária/CEP/Modelo/Ano.
That is, category/CEP/car's model/year.
We use only some lines of the table, so it can fit in my machine's memory.
The following are the variables of the table.
# Column labels applied to the CSV sample, in file order.
var_names = [
    'Unnamed',
    'COD_TARIF',
    'REGIAO',
    'COD_MODELO',
    'ANO_MODELO',
    'CEP',
    'EXPOSICAO',
    'PREMIO',
    'FREQ_SIN1',
    'INDENIZ1',
    'FREQ_SIN23',
    'INDENIZ23',
    'FREQ_SIN4',
    'INDENIZ4',
    'FREQ_SIN9',
    'INDENIZ9',
    'ENVIO',
]
Below, df_chunk.csv
contains some rows from the arq_casco3_comp table for the 1st semester of 2019.
# Read a 10-row sample (after skipping the first 20 lines of the file),
# keeping every column as text and labelling the columns with var_names.
df_test = pd.read_csv(
    'df_chunck.csv',
    names=var_names,
    dtype=str,
    skiprows=20,
    nrows=10,
)
# Plain Python list of the CEP column, in row order.
list_cep = df_test['CEP'].tolist()
The following function takes a list of CEPs as input and returns a tuple of three lists: latitudes, longitudes, and descriptions of the location.
# Presumably scratch values for trying the scraping steps by hand on one CEP.
# The second assignment immediately overrides the first; add_coord_cep uses
# its own loop variable named jj, so this module-level value does not affect it.
jj = list_cep[1]
jj = "71539-070"
def _fetch_script_text(driver, url):
    """Load *url* in *driver* and return every ``<script>`` tag as one string."""
    driver.get(url)
    soup = BeautifulSoup(driver.page_source)
    return str(soup.find_all('script'))


def _parse_place(script_text):
    """Extract ``(lat, lon, description)`` from the Google Maps script text.

    Returns ``None`` when the text does not have the layout the patterns
    below expect, instead of raising on a failed regex match.
    """
    marker = re.search(r'MAA', script_text)
    if marker is None:
        return None
    start = marker.start()
    # Only the 200 characters following the marker are inspected. Backslashes
    # are stripped first, which is presumably why the patterns look for a
    # bare "n" where the page had an escaped newline.
    window = script_text[start:start + 200].replace("\\", "")

    desc_match = re.search(r'\["(.*)"\]n,null,', window)
    coord_match = re.search(r'\[null,null,(.*)]n,"', window)
    if desc_match is None or coord_match is None:
        return None

    description = desc_match.group(1).replace('"', "")
    # The greedy capture can swallow trailing fields; keep trimming until no
    # "null" remains in the description. Each pass strictly shortens the text.
    while 'null' in description:
        trimmed = re.search(r'(.*)\]n,null,', description)
        if trimmed is None:
            return None
        description = trimmed.group(1)

    lat_lon = coord_match.group(1)
    # The latitude pattern relies on the longitude being negative (",-").
    lat_match = re.search(r'(.*),-', lat_lon)
    lon_match = re.search(r',(.*)', lat_lon)
    if lat_match is None or lon_match is None:
        return None
    try:
        return (
            float(lat_match.group(1)),
            float(lon_match.group(1)),
            str(description),
        )
    except ValueError:
        return None


def add_coord_cep(list_cep):
    """Look up coordinates for each CEP by scraping Google Maps with Chrome.

    Each CEP is searched as given; if Google Maps reports nothing, the
    suffixes "-000", "-005", "-020" and "-025" are appended in turn until one
    search succeeds. Progress messages are printed for every attempt.

    Args:
        list_cep: Iterable of CEP strings (each is concatenated onto the
            search URL).

    Returns:
        A tuple ``(list_lat, list_lon, list_local)`` of three lists aligned
        with the input order. An entry is ``None`` in all three lists when
        nothing was found for that CEP or the page could not be parsed.
        Empty input returns three empty lists without starting a browser.
    """
    ceps = list(list_cep)
    if not ceps:
        return [], [], []

    sufix_options = ["-000", "-005", "-020", "-025"]
    root_text = "https://www.google.com/maps/place/Brazil+"
    list_lat = []
    list_lon = []
    list_local = []

    driver = webdriver.Chrome("/usr/lib/chromium-browser/chromedriver")
    try:
        for cep_text in ceps:
            print(cep_text)
            script_text = _fetch_script_text(driver, root_text + cep_text)
            # 'Postal' in the scripts is the signal that Google Maps resolved
            # the query to a place.
            found = re.search('Postal', script_text) is not None
            if found:
                print('Google Maps found something.')
            else:
                print('Google Maps did not found anything.')
                # Retry with each suffix appended to the CEP.
                for sufix in sufix_options:
                    script_text = _fetch_script_text(
                        driver, root_text + cep_text + sufix
                    )
                    if re.search('Postal', script_text):
                        print('Google Maps found something.')
                        found = True
                        break
                    print('Google Maps did not found anything.')

            place = None
            if found:
                place = _parse_place(script_text)
                if place is None:
                    print('Could not parse the Google Maps result for this CEP.')
            else:
                print('We did not found any value for this CEP.')

            lat, lon, local = place if place is not None else (None, None, None)
            list_lat.append(lat)
            list_lon.append(lon)
            list_local.append(local)
    finally:
        # Always shut the browser down, even if a page load raises.
        driver.quit()

    return list_lat, list_lon, list_local
# Geocode every CEP loaded above; the result is a tuple of three lists
# (latitudes, longitudes, location descriptions).
output_coords = add_coord_cep(list_cep)
# Inspect each list in turn.
output_coords[0]  # latitudes
output_coords[1]  # longitudes
output_coords[2]  # location descriptions
# Number of description entries returned.
len(output_coords[2])