Compare commits

..

No commits in common. "f84b6a136792e86da67f66d2f3db1d7e2a2a9d7c" and "04b45eb41230a2cad5232280e69d7edd221ea5e9" have entirely different histories.

3 changed files with 1 additions and 147 deletions

2
.gitignore vendored
View file

@ -1,2 +0,0 @@
.venv/
.ipynb_checkpoints/

View file

@ -4,8 +4,6 @@ import io
import numpy as np
import sys
import os
import re
import time
import logging
import argparse as arg
@ -53,9 +51,6 @@ def parse_args():
parser.add_argument('--states',
help='states raw csv file (inside source follder)',
default='region2021.csv')
parser.add_argument('--statistics',
help='statistics raw csv file to import',
default='statistiques.csv')
debug_group = parser.add_mutually_exclusive_group()
debug_group.add_argument('--verbose', '-V',
help='Verbose output',
@ -103,58 +98,6 @@ def import_towns_csv(raw_file):
return towns.loc[towns['TYPECOM'] == 'COM', ['COM','NCC', 'LIBELLE', 'DEP']]
def import_statistics_csv(raw_file):
    """
    Load the statistics csv file, keeping only the columns used by the import.

    raw_file is the path of a ';' separated csv file. The returned DataFrame
    holds the CODGEO and SUPERF columns plus the population, births, deaths
    and housing columns listed below.
    """
    logger.info('import statistics from {}'.format(raw_file))
    # NOTE(review): CODGEO is left to pandas type inference; if the codes can
    # carry leading zeros or letters, reading it as str may be needed - confirm.
    wanted_columns = [
        "CODGEO", "SUPERF",
        "P18_POP", "P13_POP", "P08_POP", "D99_POP",
        "NAIS1318", "NAIS0813", "NAIS9908", "NAIS9099", "NAIS8290",
        "DECE1318", "DECE0813", "DECE9908", "DECE9099", "DECE8290",
        "P18_LOG", "P13_LOG", "P08_LOG", "D99_LOG", "D90_LOG", "D82_LOG",
        "P18_LOGVAC", "P13_LOGVAC", "P08_LOGVAC", "D99_LOGVAC", "D90_LOGVAC",
        "D82_LOGVAC",
        "P18_RP", "P13_RP", "P08_RP", "D99_RP", "D90_RP", "D82_RP",
        "P18_RSECOCC", "P13_RSECOCC", "P08_RSECOCC", "D99_RSECOCC",
        "D90_RSECOCC", "D82_RSECOCC",
    ]
    stats = pd.read_csv(raw_file, usecols=wanted_columns, sep=';')
    return stats
def get_single_date(attr):
    """
    Extract the year encoded in a single-date attribute name.

    The attribute must start with D or P followed by a two-digit year (for
    instance P18_POP or D99_LOG). Two-digit years starting with 0, 1 or 2 are
    mapped to the 2000s, any other to the 1900s.

    Returns a (year, 'null') tuple, 'null' standing for the absent end date,
    or (None, None) when no year can be read from attr.
    """
    logger.debug('get a date from {}'.format(attr))
    # re.match returns None when the pattern does not match, so the match
    # object itself has to be tested before any group is read.
    m = re.match(r'^[DP]([0-9]{2}).*$', attr)
    if m is None:
        logger.error('Can\'t determine single date from {}'.format(attr))
        # Same failure value as get_range_date, so callers can test for None.
        return None, None
    digits = m.group(1)
    century = 2000 if digits.startswith(('0', '1', '2')) else 1900
    return century + int(digits), 'null'
def get_range_date(attr):
    """
    Extract the two years encoded in a date-range attribute name.

    The attribute is made of optional capital letters followed by two
    two-digit years (for instance NAIS1318). Two-digit years starting with
    0, 1 or 2 are mapped to the 2000s, any other to the 1900s.

    Returns a (first_year, second_year) tuple, or (None, None) when the
    years can't be determined.
    """
    logger.debug('get date range from {}'.format(attr))
    found = re.match('^[A-Z]*([0-9]{2})([0-9]{2}).*$', attr)
    years = []
    try:
        for position in (1, 2):
            # found is None when attr does not match: the resulting error is
            # reported by the except clause below.
            logger.debug('Process two digits: {}'.format(found.group(position)))
            digits = found.group(position)
            if digits.startswith(('0', '1', '2')):
                century = 2000
            else:
                century = 1900
            years.append(century + int(digits))
    except Exception as e:
        message = 'Error when trying to determine daterange from {} - {}'
        logger.error(message.format(attr, e))
        return None, None
    return years[0], years[1]
if __name__ == '__main__':
args = parse_args()
@ -174,6 +117,7 @@ if __name__ == '__main__':
logger.setLevel(logging.DEBUG)
logger.debug('DEBUG mode activated')
if not os.path.exists(args.source + '/' + args.states):
logger.critical('can\'t find source file for states')
sys.exit(1)
@ -193,92 +137,4 @@ if __name__ == '__main__':
towns = import_towns_csv(args.source + '/' + args.towns)
logger.debug(towns)
# Load the raw statistics file, stopping the script when it is missing
statistics_file = args.source + '/' + args.statistics
if not os.path.exists(statistics_file):
    logger.critical('can\'t find source file for statistics')
    sys.exit(1)
statistics = import_statistics_csv(statistics_file)
logger.debug(statistics)
# Create missing table : indicators
# Each label is paired with the code found in the statistics column names;
# rows are numbered from 1.
indicator_labels = [
    'population',
    'naissances',
    'deces',
    'logements',
    'logements vacants',
    'residences principales',
    'residences secondaires et logements occasionnels',
]
indicator_codes = ['_POP', 'NAIS', 'DECE', '_LOG', '_LOGVAC', '_RP', '_RSECOCC']
indicators = pd.DataFrame(
    {'indicateur': indicator_labels, 'code': indicator_codes},
    index=list(range(1, len(indicator_labels) + 1)),
)
logger.debug(indicators)
## Create departments capitals
# The capital of each department goes to its own table, then the CHEFLIEU
# column is dropped from departments.
dep_capitals = departments[['CHEFLIEU', 'DEP']].rename(
    columns={'CHEFLIEU': 'CHEFLIEUDEP'}
)
departments = departments[["DEP", "NCC", "LIBELLE", "REG"]]
logger.debug(dep_capitals)

## Create states capitals
# Same split for states. The original code re-sliced departments here, which
# removed its DEP column and left states untouched.
states_capitals = states[['CHEFLIEU', 'REG']].rename(
    columns={'CHEFLIEU': 'CHEFLIEUREG'}
)
# NOTE(review): assumes states carries NCC and LIBELLE columns like
# departments does - confirm against the states import.
states = states[["REG", "NCC", "LIBELLE"]]
logger.debug(states_capitals)
## create statistics dataframes
#
# Every statistics row is crossed with every indicator: each column matching
# the indicator code gives one (town, indicator, start, end, value) record.

# Timing is reported only when verbose or debug output was requested.
# (args is the parsed command line; arg is the argparse module itself.)
timing_enabled = args.verbose or args.debug
if timing_enabled:
    t_begin = time.time()
    logger.info('BEGIN - import stats')

c_stats = pd.DataFrame(columns=['com', 'id_indicateur', 'date_debut',
                                'date_fin', 'valeur'])
temp = {
    "com": [],
    "id_indicateur": [],
    "date_debut": [],
    "date_fin": [],
    "valeur": [],
}

# The column pattern of an indicator does not depend on the statistics row,
# so it is built once: codes starting with '_' are column suffixes holding a
# single date, the others are column prefixes holding a date range.
indicator_patterns = []
for index, irow in indicators.iterrows():
    code = irow['code']
    single_date = code.startswith('_')
    regex = code + '$' if single_date else '^' + code
    indicator_patterns.append((index, code, single_date, regex))

for _, srow in statistics.iterrows():
    for index, code, single_date, regex in indicator_patterns:
        logger.debug('Process indicator {}'.format(regex))
        selection = srow.filter(regex=regex)
        for attribute, value in selection.items():
            logger.debug('check code: {}'.format(code))
            if single_date:
                start, end = get_single_date(attribute)
            else:
                start, end = get_range_date(attribute)

            if start is None or end is None:
                logger.error('Can\'t process line, continue to next')
                continue

            logger.debug(
                'town:{}, id_indic: {}, start: {}, end: {}, value:{}'
                .format(
                    srow['CODGEO'],
                    index,
                    start,
                    end,
                    value
                )
            )
            temp['com'].append(srow['CODGEO'])
            temp['id_indicateur'].append(index)
            temp['date_debut'].append(start)
            temp['date_fin'].append(end)
            temp['valeur'].append(value)

if timing_enabled:
    t_end = time.time()
    logger.info('END stats import, time: {} seconds'.format(t_end - t_begin))
sys.exit()