Process statistics

2022-04-18 18:02:02 +02:00 · 2022-04-18 18:02:02 +02:00 · f84b6a1367
commit f84b6a1367
parent aa2eebabba
1 changed files with 91 additions and 1 deletions
--- a/create_db.py
+++ b/create_db.py
@ -4,6 +4,8 @@ import io
 import numpy as np
 import sys
 import os
+import re
+import time
 import logging
 import argparse as arg

@ -120,6 +122,38 @@ def import_statistics_csv(raw_file):
            sep=';')
    return stats

+def get_single_date(attr):
+    """Extract the year encoded at the start of a single-date column name.
+
+    The name must start with ``D`` or ``P`` followed by a two-digit year.
+    Two-digit years beginning with 0, 1 or 2 are mapped to the 2000s,
+    all others to the 1900s.
+
+    Args:
+        attr: column name to parse.
+
+    Returns:
+        ``(year, 'null')`` on success; the string ``'null'`` is the
+        placeholder used for the end date of a single-date value.
+        ``(None, None)`` when no year can be extracted (an error is logged).
+    """
+    logger.debug('get a date from {}'.format(attr))
+    # `[DP]` rather than `[D,P]`: inside a character class the comma is a
+    # literal, so the old pattern also accepted names starting with ','.
+    m = re.match(r'^[DP]([0-9]{2}).*$', attr)
+    if m is None:
+        # re.match returns None when nothing matches; calling m.group() on
+        # it would raise AttributeError instead of reporting the bad name.
+        logger.error('Cannot determine single date from {}'.format(attr))
+        return None, None
+
+    two_digits = m.group(1)
+    century = 2000 if two_digits.startswith(('0', '1', '2')) else 1900
+    return century + int(two_digits), 'null'
+
+def get_range_date(attr):
+    """Extract the start and end years encoded in a date-range column name.
+
+    The name must consist of an optional run of upper-case letters followed
+    by two two-digit years (start then end). Two-digit years beginning
+    with 0, 1 or 2 are mapped to the 2000s, all others to the 1900s.
+
+    Args:
+        attr: column name to parse.
+
+    Returns:
+        ``(start_year, end_year)`` on success, ``(None, None)`` when the
+        name does not match the expected pattern (an error is logged).
+    """
+    logger.debug('get date range from {}'.format(attr))
+    m = re.match('^[A-Z]*([0-9]{2})([0-9]{2}).*$', attr)
+    if m is None:
+        # Check the match explicitly instead of relying on a broad
+        # `except Exception` around m.group(), which would also swallow
+        # unrelated errors.
+        logger.error(
+            'Error when trying to determine daterange from {} - {}'.format(
+                attr,
+                'name does not match the expected pattern'
+            )
+        )
+        return None, None
+
+    years = []
+    for two_digits in m.groups():
+        logger.debug('Process two digits: {}'.format(two_digits))
+        century = 2000 if two_digits.startswith(('0', '1', '2')) else 1900
+        # The pattern guarantees exactly two digits, so int() cannot fail.
+        years.append(century + int(two_digits))
+
+    start, end = years
+    return start, end

 if __name__ == '__main__':
    args = parse_args()
@ -175,7 +209,7 @@ if __name__ == '__main__':
        'residences principales', 
        'residences secondaires et logements occasionnels'
    ],
-    'code': ['_POP','NAIS', 'DECES','_LOG', '_LOGVAC', '_RP',  '_RSECOCC']},
+    'code': ['_POP','NAIS', 'DECE','_LOG', '_LOGVAC', '_RP',  '_RSECOCC']},
        index=[1,2,3,4,5,6,7])
    logger.debug(indicators)
    
@ -190,5 +224,61 @@ if __name__ == '__main__':
    states_capitals.columns = ["CHEFLIEUREG","REG"]
    departments = departments[["REG","NCC","LIBELLE"]]
    logger.debug(states_capitals)
+    
+    ## create statistics dataframes
+    #
+    # Flatten the statistics table: for every row and every indicator,
+    # each column whose name matches the indicator code becomes one record
+    # (com, id_indicateur, date_debut, date_fin, valeur) in `temp`.
+    #
+    # The timing test used to read `arg.debug`; `arg` is the argparse
+    # module, not the parsed arguments, so it raised AttributeError whenever
+    # args.verbose was falsy. getattr() is used because the definition of
+    # the debug option is not visible here -- TODO confirm parse_args()
+    # declares it.
+    timing_enabled = args.verbose or getattr(args, 'debug', False)
+    if timing_enabled:
+        t_begin = time.time()
+        logger.info('BEGIN - import stats')
+
+    # NOTE(review): c_stats is not filled in this section; presumably it is
+    # meant to be built from `temp` in a later step.
+    c_stats = pd.DataFrame(columns = ['com','id_indicateur','date_debut',
+        'date_fin','valeur']
+    )
+    temp = {"com" : [], "id_indicateur" : [],
+        "date_debut" : [],
+        "date_fin" : [],
+        "valeur" : []
+    }
+
+    # The column-selection regex only depends on the indicator, so build it
+    # once instead of once per statistics row.
+    # Codes starting with '_' are matched as a suffix of the column name and
+    # carry a single date; other codes are matched as a prefix and carry a
+    # date range.
+    indicator_patterns = []
+    for index, irow in indicators.iterrows():
+        code = irow['code']
+        single_date = code.startswith('_')
+        if single_date:
+            regex = code + '$'
+        else:
+            regex = '^' + code
+        indicator_patterns.append((index, code, single_date, regex))
+
+    for s_index, srow in statistics.iterrows():
+        for index, code, single_date, regex in indicator_patterns:
+            logger.debug('Process indicator {}'.format(regex))
+            selection = srow.filter(regex=regex)
+            for attribute, value in selection.items():
+                logger.debug('check code: {}'.format(code))
+                if single_date:
+                    start, end = get_single_date(attribute)
+                else:
+                    start, end = get_range_date(attribute)
+
+                # Skip columns whose name does not yield usable dates.
+                if start is None or end is None:
+                    logger.error('Can\'t process line, continue to next')
+                    continue
+
+                logger.debug(
+                    'town:{}, id_indic: {}, start: {}, end: {}, value:{}'
+                    .format(
+                        srow['CODGEO'],
+                        index,
+                        start,
+                        end,
+                        value
+                    )
+                )
+
+                temp['com'].append(srow['CODGEO'])
+                temp['id_indicateur'].append(index)
+                temp['date_debut'].append(start)
+                temp['date_fin'].append(end)
+                temp['valeur'].append(value)
+
+    if timing_enabled:
+        t_end = time.time()
+        logger.info('END stats import, time: {} seconds'.format(t_end - t_begin))

    sys.exit()