Process statistics
This commit is contained in:
parent
aa2eebabba
commit
f84b6a1367
1 changed files with 91 additions and 1 deletions
92
create_db.py
92
create_db.py
|
@ -4,6 +4,8 @@ import io
|
|||
import numpy as np
|
||||
import sys
|
||||
import os
|
||||
import re
|
||||
import time
|
||||
import logging
|
||||
import argparse as arg
|
||||
|
||||
|
@ -120,6 +122,38 @@ def import_statistics_csv(raw_file):
|
|||
sep=';')
|
||||
return stats
|
||||
|
||||
def get_single_date(attr):
    """Extract the single year encoded at the start of a column name.

    The name is expected to start with ``D`` or ``P`` followed by two
    digits (for instance ``P13_POP``).  Two-digit years whose first digit
    is 0, 1 or 2 are mapped to 20xx, every other value to 19xx.

    Args:
        attr: column name to parse.

    Returns:
        A ``(year, 'null')`` tuple, where the second item is the literal
        string ``'null'`` (a single date has no end date), or
        ``(None, None)`` when ``attr`` does not match the expected pattern.
        The caller in the main section skips entries whose start or end
        is None.
    """
    logger.debug('get a date from {}'.format(attr))
    # NOTE(review): inside the character class the comma is a literal, so
    # a leading ',' is accepted too -- kept as-is to preserve behaviour.
    m = re.match(r'^[D,P]([0-9]{2}).*$', attr)
    if m is None:
        # re.match returns None on failure; calling m.group() here would
        # raise AttributeError instead of reporting the problem.
        logger.error("Can't determine single date from {}".format(attr))
        return None, None

    digits = m.group(1)
    century = 2000 if digits.startswith(('0', '1', '2')) else 1900
    return century + int(digits), 'null'
|
||||
|
||||
def get_range_date(attr):
    """Extract the start and end years encoded in a column name.

    The name may begin with any number of uppercase letters and must then
    contain two consecutive two-digit groups (for instance ``NAIS0813``).
    Each group is expanded to a four-digit year: groups whose first digit
    is 0, 1 or 2 become 20xx, all others become 19xx.

    Args:
        attr: column name to parse.

    Returns:
        A ``(start_year, end_year)`` tuple of ints, or ``(None, None)``
        when anything goes wrong while reading the groups (the error is
        logged).
    """
    logger.debug('get date range from {}'.format(attr))
    match = re.match('^[A-Z]*([0-9]{2})([0-9]{2}).*$', attr)
    years = []
    try:
        for group_no in (1, 2):
            # When the pattern did not match, ``match`` is None and this
            # lookup raises; the handler below turns that into (None, None).
            digits = match.group(group_no)
            logger.debug('Process two digits: {}'.format(digits))
            century = 2000 if digits.startswith(('0', '1', '2')) else 1900
            years.append(century + int(digits))
    except Exception as e:
        logger.error(
            'Error when trying to determine daterange from {} - {}'.format(
                attr,
                e
            )
        )
        return None, None

    start_year, end_year = years
    return start_year, end_year
|
||||
|
||||
if __name__ == '__main__':
|
||||
args = parse_args()
|
||||
|
@ -175,7 +209,7 @@ if __name__ == '__main__':
|
|||
'residences principales',
|
||||
'residences secondaires et logements occasionnels'
|
||||
],
|
||||
'code': ['_POP','NAIS', 'DECES','_LOG', '_LOGVAC', '_RP', '_RSECOCC']},
|
||||
'code': ['_POP','NAIS', 'DECE','_LOG', '_LOGVAC', '_RP', '_RSECOCC']},
|
||||
index=[1,2,3,4,5,6,7])
|
||||
logger.debug(indicators)
|
||||
|
||||
|
@ -190,5 +224,61 @@ if __name__ == '__main__':
|
|||
states_capitals.columns = ["CHEFLIEUREG","REG"]
|
||||
departments = departments[["REG","NCC","LIBELLE"]]
|
||||
logger.debug(states_capitals)
|
||||
|
||||
## create statistics dataframes
#
# Flatten the statistics table: for every row of `statistics` and every
# indicator, each matching column yields one record
# (town code, indicator id, start date, end date, value).

# `arg` is the argparse *module* alias (import argparse as arg); the parsed
# flags live on `args`, so the debug flag must be read from `args`.
# NOTE(review): assumes parse_args() defines a `debug` option -- confirm.
timed = args.verbose or args.debug
if timed:
    t_begin = time.time()
    logger.info('BEGIN - import stats')

c_stats = pd.DataFrame(columns = ['com','id_indicateur','date_debut',
                                  'date_fin','valeur']
                       )
temp = {"com" : [], "id_indicateur" : [],
        "date_debut" : [],
        "date_fin" : [],
        "valeur" : []
        }

# The column filter of an indicator does not depend on the statistics row,
# so build it once per indicator instead of once per (row, indicator).
# Codes starting with '_' are matched at the end of the column name and
# carry a single date; other codes are matched at the start of the column
# name and carry a date range.
indicator_filters = []
for index, irow in indicators.iterrows():
    code = irow['code']
    is_single = code.startswith('_')
    regex = code + '$' if is_single else '^' + code
    indicator_filters.append((index, code, regex, is_single))

for s_index, srow in statistics.iterrows():
    for index, code, regex, is_single in indicator_filters:
        logger.debug('Process indicator {}'.format(regex))
        selection = srow.filter(regex=regex)
        for attribute, value in selection.items():
            logger.debug('check code: {}'.format(code))
            if is_single:
                start, end = get_single_date(attribute)
            else:
                start, end = get_range_date(attribute)

            if start is None or end is None:
                logger.error('Can\'t process line, continue to next')
                continue

            logger.debug(
                'town:{}, id_indic: {}, start: {}, end: {}, value:{}'
                .format(
                    srow['CODGEO'],
                    index,
                    start,
                    end,
                    value
                )
            )

            temp['com'].append(srow['CODGEO'])
            temp['id_indicateur'].append(index)
            temp['date_debut'].append(start)
            temp['date_fin'].append(end)
            temp['valeur'].append(value)

if timed:
    t_end = time.time()
    logger.info('END stats import, time: {} seconds'.format(t_end - t_begin))

sys.exit()
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue