# create_addresses_table.py - build the ADDRESSES table from the CBDB SQLite database.
# Origin: cbdb_sqlite/scripts/create_addresses_table.py (cbdb-project/cbdb_sqlite on GitHub, master branch).

import argparse
import sqlite3
import logging
from typing import Optional, List, Tuple, Dict
from dataclasses import dataclass
from datetime import datetime

# Module-wide logger used by every method of the builder below.
logger = logging.getLogger(__name__)

# Configure the root logger at import time: INFO and above, timestamped lines.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

class AddressHierarchyBuilder:
    """
    Address hierarchy relationship builder - based on Prof. Michael Fuller's VB code logic
    Handles time segment splitting and multi-level belongs relationships
    Preserves gaps in data to tell the most continuous story possible
    """

    def __init__(self, db_path: str = "latest.db"):
        self.db_path = db_path
        self.conn = None
        self.cursor = None

    def __enter__(self):
        self.conn = sqlite3.connect(self.db_path)
        self.conn.row_factory = sqlite3.Row
        self.cursor = self.conn.cursor()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        if self.conn:
            if exc_type is None:
                self.conn.commit()
            else:
                self.conn.rollback()
            self.conn.close()

    def execute(self, sql: str, params: tuple = ()) -> int:
        """Execute SQL and return affected row count"""
        self.cursor.execute(sql, params)
        return self.cursor.rowcount

    def safe_min(self, *values):
        """Safe min function that ignores None values"""
        valid_values = [v for v in values if v is not None]
        return min(valid_values) if valid_values else None

    def safe_max(self, *values):
        """Safe max function that ignores None values"""
        valid_values = [v for v in values if v is not None]
        return max(valid_values) if valid_values else None

    def clean_belongs_data(self):
        """
        Clean invalid data in ADDR_BELONGS_DATA
        This is a key step in Michael's code
        """
        logger.info("Cleaning belongs data...")

        # Create temporary table to store cleaned data
        self.execute("DROP TABLE IF EXISTS CLEANED_BELONGS_DATA")
        self.execute("""
            CREATE TEMP TABLE CLEANED_BELONGS_DATA (
                c_addr_id INTEGER,
                c_belongs_to INTEGER,
                c_firstyear INTEGER,
                c_lastyear INTEGER
            )
        """)

        # Get all belongs relationships
        self.cursor.execute("""
            SELECT abd.*,
                   ac1.c_firstyear as addr_first,
                   ac1.c_lastyear as addr_last,
                   ac2.c_firstyear as belongs_first,
                   ac2.c_lastyear as belongs_last
            FROM ADDR_BELONGS_DATA abd
            JOIN ADDR_CODES ac1 ON abd.c_addr_id = ac1.c_addr_id
            LEFT JOIN ADDR_CODES ac2 ON abd.c_belongs_to = ac2.c_addr_id
        """)

        rows = self.cursor.fetchall()
        valid_count = 0
        invalid_count = 0

        for row in rows:
            # Rule 1: Exclude Unknown (c_belongs_to = 0 or NULL)
            if not row['c_belongs_to'] or row['c_belongs_to'] == 0:
                invalid_count += 1
                continue

            # Rule 2: belongs_to unit must exist
            if row['belongs_first'] is None or row['belongs_last'] is None:
                logger.warning(f"Belongs_to unit {row['c_belongs_to']} does not exist")
                invalid_count += 1
                continue

            # Get time values, handle None cases
            abd_first = row['c_firstyear'] if row['c_firstyear'] is not None else row['addr_first']
            abd_last = row['c_lastyear'] if row['c_lastyear'] is not None else row['addr_last']

            # Calculate effective time range
            effective_first = self.safe_max(abd_first, row['addr_first'], row['belongs_first'])
            effective_last = self.safe_min(abd_last, row['addr_last'], row['belongs_last'])

            if effective_first is None or effective_last is None:
                logger.warning(f"Time range contains NULL values: {row['c_addr_id']} -> {row['c_belongs_to']}")
                invalid_count += 1
                continue

            if effective_first > effective_last:
                logger.warning(f"Invalid time range: {row['c_addr_id']} -> {row['c_belongs_to']} "
                             f"({effective_first} > {effective_last})")
                invalid_count += 1
                continue

            # Insert cleaned data
            self.execute("""
                INSERT INTO CLEANED_BELONGS_DATA
                (c_addr_id, c_belongs_to, c_firstyear, c_lastyear)
                VALUES (?, ?, ?, ?)
            """, (row['c_addr_id'], row['c_belongs_to'],
                  effective_first, effective_last))
            valid_count += 1

        logger.info(f"Data cleaning completed: {valid_count} valid, {invalid_count} invalid")

    def build_time_segments_with_gaps(self):
        """
        Build time segments including gaps
        This preserves the gaps in data and tells the most continuous story
        """
        logger.info("Building time segments with gap filling...")

        # Create result table
        self.execute("DROP TABLE IF EXISTS TIME_SEGMENTS")
        self.execute("""
            CREATE TEMP TABLE TIME_SEGMENTS (
                c_addr_id INTEGER,
                segment_start INTEGER,
                segment_end INTEGER,
                belongs_chain TEXT,
                level1_id INTEGER,
                level1_start INTEGER,
                level1_end INTEGER,
                level2_id INTEGER,
                level2_start INTEGER,
                level2_end INTEGER,
                level3_id INTEGER,
                level3_start INTEGER,
                level3_end INTEGER,
                level4_id INTEGER,
                level4_start INTEGER,
                level4_end INTEGER,
                level5_id INTEGER,
                level5_start INTEGER,
                level5_end INTEGER
            )
        """)

        # Get all addresses with valid year data
        self.cursor.execute("""
            SELECT c_addr_id, c_firstyear, c_lastyear
            FROM ADDR_CODES
            WHERE c_firstyear IS NOT NULL AND c_lastyear IS NOT NULL
        """)
        addresses = self.cursor.fetchall()

        logger.info(f"Processing {len(addresses)} addresses with valid year data...")

        for addr_row in addresses:
            addr_id = addr_row['c_addr_id']
            addr_first = addr_row['c_firstyear']
            addr_last = addr_row['c_lastyear']

            # Skip if years are invalid
            if addr_first is None or addr_last is None or addr_first > addr_last:
                logger.warning(f"Skipping address {addr_id} with invalid years: {addr_first}-{addr_last}")
                continue

            # Get all level 1 belongs relationships for this address
            self.cursor.execute("""
                SELECT DISTINCT c_belongs_to, c_firstyear, c_lastyear
                FROM CLEANED_BELONGS_DATA
                WHERE c_addr_id = ?
                ORDER BY c_firstyear
            """, (addr_id,))

            level1_belongs = self.cursor.fetchall()

            if not level1_belongs:
                # No belongs relationship for entire period
                self._insert_segment(addr_id, addr_first, addr_last, {})
            else:
                # Process each L1 relationship and fill gaps
                current_year = addr_first

                for l1 in level1_belongs:
                    l1_start = l1['c_firstyear']
                    l1_end = l1['c_lastyear']
                    l1_id = l1['c_belongs_to']

                    # If there's a gap before this L1 relationship
                    if current_year < l1_start:
                        # Insert gap record with only L1 (no deeper levels)
                        gap_chain = {'level1': {
                            'id': l1_id,
                            'start': current_year,
                            'end': l1_start - 1
                        }}
                        self._insert_segment(addr_id, current_year, l1_start - 1, gap_chain)

                    # Process the actual L1 period with its nested relationships
                    self._process_level1_with_gaps(addr_id, l1_id, l1_start, l1_end)

                    current_year = l1_end + 1

                # Fill gap at the end if needed
                if addr_last is not None and current_year <= addr_last:
                    # Use the last L1 belongs for the gap
                    if level1_belongs:
                        last_l1 = level1_belongs[-1]
                        gap_chain = {'level1': {
                            'id': last_l1['c_belongs_to'],
                            'start': current_year,
                            'end': addr_last
                        }}
                        self._insert_segment(addr_id, current_year, addr_last, gap_chain)

    def _process_level1_with_gaps(self, addr_id: int, l1_id: int, l1_start: int, l1_end: int):
        """
        Process a Level 1 belongs period, filling gaps in Level 2+ relationships
        """
        if l1_start is None or l1_end is None:
            return

        # Get Level 2 relationships for this L1
        self.cursor.execute("""
            SELECT DISTINCT c_belongs_to, c_firstyear, c_lastyear
            FROM CLEANED_BELONGS_DATA
            WHERE c_addr_id = ?
              AND c_firstyear <= ?
              AND c_lastyear >= ?
            ORDER BY c_firstyear
        """, (l1_id, l1_end, l1_start))

        level2_belongs = self.cursor.fetchall()

        if not level2_belongs:
            # No Level 2 for entire L1 period
            chain = {'level1': {'id': l1_id, 'start': l1_start, 'end': l1_end}}
            self._insert_segment(addr_id, l1_start, l1_end, chain)
        else:
            # Process L2 relationships and fill gaps
            current_year = l1_start

            for l2 in level2_belongs:
                # Calculate intersection with L1 period
                l2_effective_start = max(l2['c_firstyear'], l1_start)
                l2_effective_end = min(l2['c_lastyear'], l1_end)

                if l2_effective_start > l2_effective_end:
                    continue

                # Fill gap before this L2 if needed
                if current_year < l2_effective_start:
                    gap_chain = {
                        'level1': {'id': l1_id, 'start': current_year, 'end': l2_effective_start - 1}
                    }
                    self._insert_segment(addr_id, current_year, l2_effective_start - 1, gap_chain)

                # Process the actual L2 period with deeper levels
                self._process_level2_with_gaps(addr_id, l1_id, l1_start, l1_end,
                                              l2['c_belongs_to'], l2_effective_start, l2_effective_end)

                current_year = l2_effective_end + 1

            # Fill gap at the end of L1 period if needed
            if current_year <= l1_end:
                gap_chain = {
                    'level1': {'id': l1_id, 'start': current_year, 'end': l1_end}
                }
                self._insert_segment(addr_id, current_year, l1_end, gap_chain)

    def _process_level2_with_gaps(self, addr_id: int, l1_id: int, l1_start: int, l1_end: int,
                                 l2_id: int, l2_start: int, l2_end: int):
        """
        Process Level 2 and deeper, continuing to fill gaps
        """
        if l2_start is None or l2_end is None:
            return

        # Get Level 3 relationships
        self.cursor.execute("""
            SELECT DISTINCT c_belongs_to, c_firstyear, c_lastyear
            FROM CLEANED_BELONGS_DATA
            WHERE c_addr_id = ?
              AND c_firstyear <= ?
              AND c_lastyear >= ?
            ORDER BY c_firstyear
        """, (l2_id, l2_end, l2_start))

        level3_belongs = self.cursor.fetchall()

        if not level3_belongs:
            # No Level 3 for entire L2 period
            chain = {
                'level1': {'id': l1_id, 'start': l1_start, 'end': l1_end},
                'level2': {'id': l2_id, 'start': l2_start, 'end': l2_end}
            }
            self._insert_segment(addr_id, l2_start, l2_end, chain)
        else:
            # Process L3 relationships and fill gaps
            current_year = l2_start

            for l3 in level3_belongs:
                # Calculate intersection
                l3_effective_start = max(l3['c_firstyear'], l2_start)
                l3_effective_end = min(l3['c_lastyear'], l2_end)

                if l3_effective_start > l3_effective_end:
                    continue

                # Fill gap before this L3
                if current_year < l3_effective_start:
                    gap_chain = {
                        'level1': {'id': l1_id, 'start': l1_start, 'end': l1_end},
                        'level2': {'id': l2_id, 'start': current_year, 'end': l3_effective_start - 1}
                    }
                    self._insert_segment(addr_id, current_year, l3_effective_start - 1, gap_chain)

                # Create segment with L3
                chain = {
                    'level1': {'id': l1_id, 'start': l1_start, 'end': l1_end},
                    'level2': {'id': l2_id, 'start': l2_start, 'end': l2_end},
                    'level3': {'id': l3['c_belongs_to'], 'start': l3_effective_start, 'end': l3_effective_end}
                }

                # Continue to L4 and L5 if needed
                self._process_deeper_levels(addr_id, chain, l3['c_belongs_to'],
                                           l3_effective_start, l3_effective_end, 3)

                current_year = l3_effective_end + 1

            # Fill gap at end of L2 period
            if current_year <= l2_end:
                gap_chain = {
                    'level1': {'id': l1_id, 'start': l1_start, 'end': l1_end},
                    'level2': {'id': l2_id, 'start': current_year, 'end': l2_end}
                }
                self._insert_segment(addr_id, current_year, l2_end, gap_chain)

    def _process_deeper_levels(self, addr_id: int, chain: Dict, parent_id: int,
                              start: int, end: int, current_level: int):
        """
        Generic processor for levels 4 and 5
        """
        if start is None or end is None:
            return

        if current_level >= 5:
            # Already at max depth, save the segment
            self._insert_segment(addr_id, start, end, chain)
            return

        next_level = current_level + 1

        # Get next level relationships
        self.cursor.execute("""
            SELECT DISTINCT c_belongs_to, c_firstyear, c_lastyear
            FROM CLEANED_BELONGS_DATA
            WHERE c_addr_id = ?
              AND c_firstyear <= ?
              AND c_lastyear >= ?
            ORDER BY c_firstyear
        """, (parent_id, end, start))

        next_belongs = self.cursor.fetchall()

        if not next_belongs:
            # No deeper level, save current chain
            self._insert_segment(addr_id, start, end, chain)
        else:
            # Process with gaps
            current_year = start

            for nb in next_belongs:
                nb_start = max(nb['c_firstyear'], start)
                nb_end = min(nb['c_lastyear'], end)

                if nb_start > nb_end:
                    continue

                # Fill gap before
                if current_year < nb_start:
                    self._insert_segment(addr_id, current_year, nb_start - 1, chain)

                # Create new chain with next level
                new_chain = chain.copy()
                new_chain[f'level{next_level}'] = {
                    'id': nb['c_belongs_to'],
                    'start': nb_start,
                    'end': nb_end
                }

                # Continue deeper
                self._process_deeper_levels(addr_id, new_chain, nb['c_belongs_to'],
                                          nb_start, nb_end, next_level)

                current_year = nb_end + 1

            # Fill gap at end
            if current_year <= end:
                self._insert_segment(addr_id, current_year, end, chain)

    def _insert_segment(self, addr_id: int, start: int, end: int, chain: Dict):
        """Insert a time segment record"""
        if start is None or end is None:
            return

        values = [addr_id, start, end, str(chain)]

        # Add level information
        for i in range(1, 6):
            if f'level{i}' in chain:
                values.extend([
                    chain[f'level{i}']['id'],
                    chain[f'level{i}'].get('start', start),
                    chain[f'level{i}'].get('end', end)
                ])
            else:
                values.extend([None, None, None])

        placeholders = ','.join(['?' for _ in values])
        self.execute(f"""
            INSERT INTO TIME_SEGMENTS VALUES ({placeholders})
        """, tuple(values))

    def build_final_addresses_table(self):
        """Build final ADDRESSES table"""
        logger.info("Building final ADDRESSES table...")

        # Drop old table
        self.execute("DROP TABLE IF EXISTS ADDRESSES")

        # Create new table matching Michael's structure
        self.execute("""
            CREATE TABLE ADDRESSES (
                c_addr_id INTEGER,
                c_name TEXT,
                c_name_chn TEXT,
                c_admin_type TEXT,
                c_firstyear INTEGER,
                c_lastyear INTEGER,
                c_belongs_firstyear INTEGER,
                c_belongs_lastyear INTEGER,
                x_coord REAL,
                y_coord REAL,
                belongs1_ID INTEGER,
                belongs1_Name TEXT,
                belongs1_Name_chn TEXT,
                belongs2_ID INTEGER,
                belongs2_Name TEXT,
                belongs2_Name_chn TEXT,
                belongs3_ID INTEGER,
                belongs3_Name TEXT,
                belongs3_Name_chn TEXT,
                belongs4_ID INTEGER,
                belongs4_Name TEXT,
                belongs4_Name_chn TEXT,
                belongs5_ID INTEGER,
                belongs5_Name TEXT,
                belongs5_Name_chn TEXT
            )
        """)

        # Build final data from TIME_SEGMENTS
        self.execute("""
            INSERT INTO ADDRESSES
            SELECT
                ts.c_addr_id,
                ac.c_name,
                ac.c_name_chn,
                ac.c_admin_type,
                ac.c_firstyear,
                ac.c_lastyear,
                ts.segment_start as c_belongs_firstyear,
                ts.segment_end as c_belongs_lastyear,
                ac.x_coord,
                ac.y_coord,
                ts.level1_id,
                a1.c_name,
                a1.c_name_chn,
                ts.level2_id,
                a2.c_name,
                a2.c_name_chn,
                ts.level3_id,
                a3.c_name,
                a3.c_name_chn,
                ts.level4_id,
                a4.c_name,
                a4.c_name_chn,
                ts.level5_id,
                a5.c_name,
                a5.c_name_chn
            FROM TIME_SEGMENTS ts
            JOIN ADDR_CODES ac ON ts.c_addr_id = ac.c_addr_id
            LEFT JOIN ADDR_CODES a1 ON ts.level1_id = a1.c_addr_id
            LEFT JOIN ADDR_CODES a2 ON ts.level2_id = a2.c_addr_id
            LEFT JOIN ADDR_CODES a3 ON ts.level3_id = a3.c_addr_id
            LEFT JOIN ADDR_CODES a4 ON ts.level4_id = a4.c_addr_id
            LEFT JOIN ADDR_CODES a5 ON ts.level5_id = a5.c_addr_id
            ORDER BY ts.c_addr_id, ts.segment_start
        """)

        count = self.cursor.rowcount
        logger.info(f"ADDRESSES table created with {count} records")

        # Verify example cases
        self._verify_example_cases()

    def _verify_example_cases(self):
        """Verify the specific cases mentioned in Michael's emails"""
        # Check Jiangle (100149)
        logger.info("Verifying Jiangle (100149)...")
        self.cursor.execute("""
            SELECT c_belongs_firstyear, c_belongs_lastyear,
                   belongs1_Name_chn, belongs2_Name_chn, belongs3_Name_chn
            FROM ADDRESSES
            WHERE c_addr_id = 100149
            ORDER BY c_belongs_firstyear
        """)

        results = self.cursor.fetchall()
        if results:
            logger.info(f"Jiangle has {len(results)} records:")
            for row in results:
                logger.info(f"  {row['c_belongs_firstyear']}-{row['c_belongs_lastyear']}: "
                           f"{row['belongs1_Name_chn']} -> {row['belongs2_Name_chn'] or ''} -> "
                           f"{row['belongs3_Name_chn'] or ''}")

        # Check Jun county (4524) if it exists
        self.cursor.execute("""
            SELECT c_belongs_firstyear, c_belongs_lastyear,
                   belongs1_Name_chn, belongs2_Name_chn, belongs3_Name_chn, belongs4_Name_chn
            FROM ADDRESSES
            WHERE c_addr_id = 4524
            ORDER BY c_belongs_firstyear
            LIMIT 10
        """)

        results = self.cursor.fetchall()
        if results:
            logger.info(f"\nJun county (4524) has {len(results)} records (showing first 10):")
            for row in results:
                logger.info(f"  {row['c_belongs_firstyear']}-{row['c_belongs_lastyear']}: "
                           f"{row['belongs1_Name_chn']} -> {row['belongs2_Name_chn'] or ''} -> "
                           f"{row['belongs3_Name_chn'] or ''} -> {row['belongs4_Name_chn'] or ''}")

    def run(self):
        """Execute complete build process"""
        try:
            logger.info("="*60)
            logger.info("Starting address hierarchy build with gap preservation...")
            logger.info("="*60)

            # 1. Clean data
            self.clean_belongs_data()

            # 2. Build time segments with gaps
            self.build_time_segments_with_gaps()

            # 3. Generate final table
            self.build_final_addresses_table()

            logger.info("="*60)
            logger.info("Build completed!")
            logger.info("="*60)

        except Exception as e:
            logger.error(f"Build process error: {e}")
            import traceback
            traceback.print_exc()
            raise

def main(argv=None):
    """Command-line entry point: build the ADDRESSES table for one database.

    Args:
        argv: Argument strings to parse. ``None`` (the default) lets argparse
            read them from ``sys.argv[1:]``, which is what the script guard
            below relies on; passing a list allows calling this from code.
    """
    parser = argparse.ArgumentParser(description="Build the ADDRESSES table from the CBDB SQLite database.")
    parser.add_argument("--db", default="latest.db", help="Path to the SQLite database file to process")
    args = parser.parse_args(argv)

    # The context manager commits on success and rolls back on failure.
    with AddressHierarchyBuilder(args.db) as builder:
        builder.run()


# Usage
if __name__ == "__main__":
    main()