From 3d8c01e3c7dda6fe58f441c0c5e40b79ac5abe12 Mon Sep 17 00:00:00 2001 From: Yorick Barbanneau Date: Mon, 18 Apr 2022 12:54:15 +0200 Subject: [PATCH 1/6] Add .gitignore --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..9e0298a --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +.venv/ +.ipynb_checkpoints/ From 6f48b59da9e0c2c34446de188a06245b99858e69 Mon Sep 17 00:00:00 2001 From: Yorick Barbanneau Date: Mon, 18 Apr 2022 13:25:07 +0200 Subject: [PATCH 2/6] Import statistics csv --- create_db.py | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/create_db.py b/create_db.py index a15cd43..5c4a846 100755 --- a/create_db.py +++ b/create_db.py @@ -51,6 +51,9 @@ def parse_args(): parser.add_argument('--states', help='states raw csv file (inside source follder)', default='region2021.csv') + parser.add_argument('--statistics', + help='statistics raw csv file to import', + default='statistiques.csv') debug_group = parser.add_mutually_exclusive_group() debug_group.add_argument('--verbose', '-V', help='Verbose output', @@ -98,6 +101,26 @@ def import_towns_csv(raw_file): return towns.loc[towns['TYPECOM'] == 'COM', ['COM','NCC', 'LIBELLE', 'DEP']] + +def import_statistics_csv(raw_file): + """ + Process stats files + """ + + logger.info('import town from {}'.format(raw_file)) + stats = pd.read_csv(raw_file, + usecols=["CODGEO","SUPERF","P18_POP","P13_POP","P08_POP","D99_POP", + "NAIS1318","NAIS0813","NAIS9908","NAIS9099","NAIS8290","DECE1318", + "DECE0813","DECE9908","DECE9099","DECE8290","P18_LOG","P13_LOG", + "P08_LOG","D99_LOG","D90_LOG","D82_LOG", "P18_LOGVAC","P13_LOGVAC", + "P08_LOGVAC","D99_LOGVAC","D90_LOGVAC","D82_LOGVAC","P18_RP", + "P13_RP","P08_RP","D99_RP","D90_RP","D82_RP", "P18_RSECOCC", + "P13_RSECOCC","P08_RSECOCC","D99_RSECOCC","D90_RSECOCC", + "D82_RSECOCC"], + sep=';') + return stats + + if __name__ == '__main__': args = parse_args() @@ -117,7 +140,6 @@ if __name__ == '__main__': logger.setLevel(logging.DEBUG) logger.debug('DEBUG mode activated') - if not os.path.exists(args.source + '/' + args.states): logger.critical('can\'t find source file for states') sys.exit(1) @@ -137,4 +159,10 @@ if __name__ == '__main__': towns = import_towns_csv(args.source + '/' + args.towns) logger.debug(towns) + if not os.path.exists(args.source + '/' + args.statistics): + logger.critical('can\'t find source file for statistics') + sys.exit(1) + statistics = import_statistics_csv(args.source + '/' + args.statistics) + logger.debug(statistics) + sys.exit() From 02488646a697b48bafb7b07de28754966f6dde1f Mon Sep 17 00:00:00 2001 From: Yorick Barbanneau Date: Mon, 18 Apr 2022 13:26:11 +0200 Subject: [PATCH 3/6] Rename notebook with correct extention --- notebook.ipnb => notebook.ipynb | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename notebook.ipnb => notebook.ipynb (100%) diff --git a/notebook.ipnb b/notebook.ipynb similarity index 100% rename from notebook.ipnb rename to notebook.ipynb From 8490d85ab6a92addd6c50f09c83612022af603eb Mon Sep 17 00:00:00 2001 From: Yorick Barbanneau Date: Mon, 18 Apr 2022 13:55:15 +0200 Subject: [PATCH 4/6] Create indicator dataframe --- create_db.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/create_db.py b/create_db.py index 5c4a846..d199a90 100755 --- a/create_db.py +++ b/create_db.py @@ -164,5 +164,20 @@ if __name__ == '__main__': sys.exit(1) statistics = import_statistics_csv(args.source + '/' + args.statistics) logger.debug(statistics) + + # Create missing table : indicators + indicators = pd.DataFrame({'indicateur': [ + 'population', + 'naissances', + 'deces', + 'logements', + 'logements vacants', + 'residences principales', + 'residences secondaires et logements occasionnels' + ], + 'code': ['_POP','NAIS', 'DECES','_LOG', '_LOGVAC', '_RP', '_RSECOCC']}, + index=[1,2,3,4,5,6,7]) + logger.debug(indicators) + sys.exit() From aa2eebabba9bbdb898fbec8bed515c2595121a4e Mon Sep 17 00:00:00 2001 From: Yorick Barbanneau Date: Mon, 18 Apr 2022 14:07:25 +0200 Subject: [PATCH 5/6] Create capitals dataframes --- create_db.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/create_db.py b/create_db.py index d199a90..9af7bac 100755 --- a/create_db.py +++ b/create_db.py @@ -179,5 +179,16 @@ if __name__ == '__main__': index=[1,2,3,4,5,6,7]) logger.debug(indicators) + ## Create departments capitals + dep_capitals = departments[['CHEFLIEU','DEP']] + dep_capitals.columns = ["CHEFLIEUDEP","DEP"] + departments = departments[["DEP","NCC","LIBELLE","REG"]] + logger.debug(dep_capitals) + + ## Create states capitals + states_capitals = states[['CHEFLIEU','REG']] + states_capitals.columns = ["CHEFLIEUREG","REG"] + departments = departments[["REG","NCC","LIBELLE"]] + logger.debug(states_capitals) sys.exit() From f84b6a136792e86da67f66d2f3db1d7e2a2a9d7c Mon Sep 17 00:00:00 2001 From: Yorick Barbanneau Date: Mon, 18 Apr 2022 18:02:02 +0200 Subject: [PATCH 6/6] Process statistics --- create_db.py | 92 +++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 91 insertions(+), 1 deletion(-) diff --git a/create_db.py b/create_db.py index 9af7bac..e13071d 100755 --- a/create_db.py +++ b/create_db.py @@ -4,6 +4,8 @@ import io import numpy as np import sys import os +import re +import time import logging import argparse as arg @@ -120,6 +122,38 @@ def import_statistics_csv(raw_file): sep=';') return stats +def get_single_date(attr): + logger.debug('get a date from {}'.format(attr)) + m = re.match('^[D,P]([0-9]{2}).*$', attr) + if m.group(1) is None: + logger.error('Cand determine single date from {}'.format(attr)) + + if m.group(1).startswith(tuple(['0','1','2'])): + return 2000 + int(m.group(1)), 'null' + else: + return 1900 + int(m.group(1)), 'null' + +def get_range_date(attr): + logger.debug('get date range from {}'.format(attr)) + m = re.match('^[A-Z]*([0-9]{2})([0-9]{2}).*$', attr) + date = {} + try: + for i in [1,2]: + logger.debug('Process two digits: {}'.format(m.group(i))) + if m.group(i).startswith(tuple(['0','1','2'])): + date[i] = 2000 + int(m.group(i)) + else: + date[i] = 1900 + int(m.group(i)) + except Exception as e: + logger.error( + 'Error when trying to determine daterange from {} - {}'.format( + attr, + e + ) + ) + return None, None + + return date[1], date[2] if __name__ == '__main__': args = parse_args() @@ -175,7 +209,7 @@ if __name__ == '__main__': 'residences principales', 'residences secondaires et logements occasionnels' ], - 'code': ['_POP','NAIS', 'DECES','_LOG', '_LOGVAC', '_RP', '_RSECOCC']}, + 'code': ['_POP','NAIS', 'DECE','_LOG', '_LOGVAC', '_RP', '_RSECOCC']}, index=[1,2,3,4,5,6,7]) logger.debug(indicators) @@ -190,5 +224,61 @@ if __name__ == '__main__': states_capitals.columns = ["CHEFLIEUREG","REG"] departments = departments[["REG","NCC","LIBELLE"]] logger.debug(states_capitals) + + ## create statistics dataframes + # + # We need to first iterate on statistics + if args.verbose or arg.debug: + t_begin = time.time() + logger.info('BEGIN - import stats') + + c_stats = pd.DataFrame(columns = ['com','id_indicateur','date_debut', + 'date_fin','valeur'] + ) + temp = {"com" : [], "id_indicateur" : [], + "date_debut" : [], + "date_fin" : [], + "valeur" : [] + } + for s_index,srow in statistics.iterrows(): + for index, irow in indicators.iterrows(): + if irow['code'].startswith('_'): + regex = irow['code'] + '$' + else: + regex = '^' + irow['code'] + + logger.debug('Process indicator {}'.format(regex)) + selection = srow.filter(regex=regex) + for attribute, value in selection.items(): + logger.debug('check code: {}'.format(irow['code'])) + if irow['code'].startswith('_'): + start,end = get_single_date(attribute) + else: + start,end = get_range_date(attribute) + + if start is None or end is None: + logger.error('Can\'t process line, continue to next') + continue + + logger.debug( + 'town:{}, id_indic: {}, start: {}, end: {}, value:{}' + .format( + srow['CODGEO'], + index, + start, + end, + value + ) + ) + + temp['com'].append(srow['CODGEO']) + temp['id_indicateur'].append(index) + temp['date_debut'].append(start) + temp['date_fin'].append(end) + temp['valeur'].append(value) + + if args.verbose or arg.debug: + t_end = time.time() + logger.info('END stats import, time: {} seconds'.format(t_end - t_begin)) sys.exit()