Compare commits

6 commits

3 changed files with 147 additions and 1 deletion

.gitignore

@@ -0,0 +1,2 @@
.venv/
.ipynb_checkpoints/


@@ -4,6 +4,8 @@ import io
import numpy as np
import sys
import os
import re
import time
import logging
import argparse as arg
@@ -51,6 +53,9 @@ def parse_args():
parser.add_argument('--states',
help='states raw csv file (inside source folder)',
default='region2021.csv')
parser.add_argument('--statistics',
help='statistics raw csv file to import',
default='statistiques.csv')
debug_group = parser.add_mutually_exclusive_group()
debug_group.add_argument('--verbose', '-V',
help='Verbose output',
@@ -98,6 +103,58 @@ def import_towns_csv(raw_file):
return towns.loc[towns['TYPECOM'] == 'COM', ['COM','NCC', 'LIBELLE', 'DEP']]
def import_statistics_csv(raw_file):
"""
Process stats files
"""
logger.info('import statistics from {}'.format(raw_file))
stats = pd.read_csv(raw_file,
usecols=["CODGEO","SUPERF","P18_POP","P13_POP","P08_POP","D99_POP",
"NAIS1318","NAIS0813","NAIS9908","NAIS9099","NAIS8290","DECE1318",
"DECE0813","DECE9908","DECE9099","DECE8290","P18_LOG","P13_LOG",
"P08_LOG","D99_LOG","D90_LOG","D82_LOG", "P18_LOGVAC","P13_LOGVAC",
"P08_LOGVAC","D99_LOGVAC","D90_LOGVAC","D82_LOGVAC","P18_RP",
"P13_RP","P08_RP","D99_RP","D90_RP","D82_RP", "P18_RSECOCC",
"P13_RSECOCC","P08_RSECOCC","D99_RSECOCC","D90_RSECOCC",
"D82_RSECOCC"],
sep=';')
return stats
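# The selected column names encode their reference period directly: single-date
# columns carry one two-digit year behind a P or D prefix (P18_POP, D99_LOG),
# while range columns carry a start and an end year (NAIS1318, DECE8290).
# The two helper functions below decode those two patterns.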
def get_single_date(attr):
"""Return the census year encoded in a single-date attribute name."""
logger.debug('get a date from {}'.format(attr))
m = re.match('^[DP]([0-9]{2}).*$', attr)
if m is None:
logger.error('Can\'t determine single date from {}'.format(attr))
return None, None
if m.group(1).startswith(tuple(['0','1','2'])):
return 2000 + int(m.group(1)), 'null'
else:
return 1900 + int(m.group(1)), 'null'
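# e.g. get_single_date('P18_POP') -> (2018, 'null'), get_single_date('D99_LOG') -> (1999, 'null')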
def get_range_date(attr):
"""Return the (start, end) census years encoded in a range attribute name."""
logger.debug('get date range from {}'.format(attr))
m = re.match('^[A-Z]*([0-9]{2})([0-9]{2}).*$', attr)
date = {}
try:
for i in [1,2]:
logger.debug('Process two digits: {}'.format(m.group(i)))
if m.group(i).startswith(tuple(['0','1','2'])):
date[i] = 2000 + int(m.group(i))
else:
date[i] = 1900 + int(m.group(i))
except Exception as e:
logger.error(
'Error when trying to determine daterange from {} - {}'.format(
attr,
e
)
)
return None, None
return date[1], date[2]
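# e.g. get_range_date('NAIS1318') -> (2013, 2018), get_range_date('DECE8290') -> (1982, 1990)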
if __name__ == '__main__':
args = parse_args()
@@ -117,7 +174,6 @@ if __name__ == '__main__':
logger.setLevel(logging.DEBUG)
logger.debug('DEBUG mode activated')
if not os.path.exists(args.source + '/' + args.states):
logger.critical('can\'t find source file for states')
sys.exit(1)
@@ -137,4 +193,92 @@ if __name__ == '__main__':
towns = import_towns_csv(args.source + '/' + args.towns)
logger.debug(towns)
if not os.path.exists(args.source + '/' + args.statistics):
logger.critical('can\'t find source file for statistics')
sys.exit(1)
statistics = import_statistics_csv(args.source + '/' + args.statistics)
logger.debug(statistics)
# Create missing table: indicators
indicators = pd.DataFrame({'indicateur': [
'population',
'naissances',
'deces',
'logements',
'logements vacants',
'residences principales',
'residences secondaires et logements occasionnels'
],
'code': ['_POP','NAIS', 'DECE','_LOG', '_LOGVAC', '_RP', '_RSECOCC']},
index=[1,2,3,4,5,6,7])
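# A code starting with '_' is used as a column suffix (P18_POP, D99_LOG), any
# other code as a column prefix (NAIS1318, DECE0813); see the regex built in the loop below.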
logger.debug(indicators)
## Create departments capitals
dep_capitals = departments[['CHEFLIEU','DEP']]
dep_capitals.columns = ["CHEFLIEUDEP","DEP"]
departments = departments[["DEP","NCC","LIBELLE","REG"]]
logger.debug(dep_capitals)
## Create states capitals
states_capitals = states[['CHEFLIEU','REG']]
states_capitals.columns = ["CHEFLIEUREG","REG"]
departments = departments[["REG","NCC","LIBELLE"]]
logger.debug(states_capitals)
## Create statistics dataframes
#
# We first need to iterate over the statistics rows
if args.verbose or args.debug:
t_begin = time.time()
logger.info('BEGIN - import stats')
c_stats = pd.DataFrame(columns=['com', 'id_indicateur', 'date_debut',
'date_fin', 'valeur'])
temp = {"com" : [], "id_indicateur" : [],
"date_debut" : [],
"date_fin" : [],
"valeur" : []
}
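# Rows are collected in plain python lists first: appending to a DataFrame
# inside the loop would be much slower than building it in one go afterwards.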
for s_index,srow in statistics.iterrows():
for index, irow in indicators.iterrows():
if irow['code'].startswith('_'):
regex = irow['code'] + '$'
else:
regex = '^' + irow['code']
logger.debug('Process indicator {}'.format(regex))
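# srow.filter(regex=regex) keeps only the columns of this row whose name matches the indicator pattern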
selection = srow.filter(regex=regex)
for attribute, value in selection.items():
logger.debug('check code: {}'.format(irow['code']))
if irow['code'].startswith('_'):
start,end = get_single_date(attribute)
else:
start,end = get_range_date(attribute)
if start is None or end is None:
logger.error('Can\'t process line, continue to next')
continue
logger.debug(
'town:{}, id_indic: {}, start: {}, end: {}, value:{}'
.format(
srow['CODGEO'],
index,
start,
end,
value
)
)
temp['com'].append(srow['CODGEO'])
temp['id_indicateur'].append(index)
temp['date_debut'].append(start)
temp['date_fin'].append(end)
temp['valeur'].append(value)
if args.verbose or args.debug:
t_end = time.time()
logger.info('END stats import, time: {} seconds'.format(t_end - t_begin))
sys.exit()