Processing Australian Parliamentary Hansard Data

The hansardR package provides a comprehensive toolkit for processing, validating, and analysing Australian Parliamentary Hansard CSV data. This vignette demonstrates the complete workflow from raw CSV files to a structured database ready for computational social science research.

Overview

The package handles parliamentary data with the following structure:

  - Sessions: Parliamentary sitting days
  - Members: MPs with party affiliations and electorates
  - Debates: Major topics discussed in each session
  - Speeches: Individual contributions, questions, answers, and interjections

Setup

library(hansardR)
library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
library(DBI)

# Use sample data included with the package
sample_data_path <- system.file("extdata", "houseCSV", package = "hansardR")
db_path <- tempfile(fileext = ".db")  # Use temporary database for vignette

cat("Sample data location:", sample_data_path, "\n")
#> Sample data location: /home/runner/work/_temp/Library/hansardR/extdata/houseCSV
cat("Temporary database:", db_path, "\n")
#> Temporary database: /tmp/RtmpB1ZF7v/file1ec83e7e0eb7.db

# Show available sample data structure
if (dir.exists(sample_data_path)) {
  cat("\nAvailable sample years:\n")
  year_dirs <- list.dirs(sample_data_path, recursive = FALSE, full.names = FALSE)
  for (year in year_dirs) {
    year_path <- file.path(sample_data_path, year)
    files <- list.files(year_path, pattern = "\\.csv$")
    cat(" ", year, ": ", length(files), " files\n")
    for (file in files) {
      cat("   -", file, "\n")
    }
  }
}
#> 
#> Available sample years:
#>   2024 :  3  files
#>    - 2024-02-08_edit_step7.csv 
#>    - 2024-05-14_edit_step7.csv 
#>    - 2024-11-21_edit_step7.csv 
#>   2025 :  2  files
#>    - 2025-02-06_edit_step7.csv 
#>    - 2025-03-25_edit_step7.csv

Step 1: Database Creation

Create a new SQLite database with the standard Hansard schema:

# Create database with standard schema
con <- create_hansard_database(db_path, overwrite = TRUE)
#> Database triggers created
#> Standard schema created with optimizations
#> Database created successfully at: /tmp/RtmpB1ZF7v/file1ec83e7e0eb7.db

# Or connect to existing database
# con <- connect_hansard_database(db_path)

The database includes optimised indexes for:

  - Temporal queries (by date, year)
  - Member-based analysis (by party, electorate)
  - Content analysis (full-text search ready)
  - Debate structure (hierarchical organisation)
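
You can confirm what was created by querying SQLite's catalogue directly (standard SQLite introspection, independent of hansardR):

# List the indexes SQLite holds for this database
DBI::dbGetQuery(con, "
  SELECT name, tbl_name
  FROM sqlite_master
  WHERE type = 'index'
  ORDER BY tbl_name, name
")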

Step 2: File Discovery and Validation

Before processing, validate your CSV files to ensure data integrity:

# Find sample CSV files included with the package
sample_years <- list.dirs(sample_data_path, recursive = FALSE, full.names = TRUE)
sample_years <- sample_years[grepl("\\d{4}$", basename(sample_years))]  # Only year directories

# Get all sample files
sample_files <- character()
for (year_dir in sample_years) {
  files <- list.files(year_dir, pattern = "_edit_step7\\.csv$", full.names = TRUE)
  sample_files <- c(sample_files, files)
}

cat("Found", length(sample_files), "sample files across", length(sample_years), "years:\n")
#> Found 5 sample files across 2 years:
cat("Years available:", paste(basename(sample_years), collapse = ", "), "\n")
#> Years available: 2024, 2025

# Show file details
for (file in sample_files) {
  year <- basename(dirname(file))
  filename <- basename(file)
  size_kb <- round(file.size(file) / 1024, 1)
  cat(" ", year, "/", filename, " (", size_kb, " KB)\n", sep = "")
}
#>  2024/2024-02-08_edit_step7.csv (316.4 KB)
#>  2024/2024-05-14_edit_step7.csv (380.6 KB)
#>  2024/2024-11-21_edit_step7.csv (469.9 KB)
#>  2025/2025-02-06_edit_step7.csv (535.8 KB)
#>  2025/2025-03-25_edit_step7.csv (348.2 KB)
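
The loop above keeps the per-year structure explicit; a recursive list.files() call yields the same file set in one step (assuming the directory layout shown earlier):

# Equivalent one-call discovery
sample_files <- list.files(
  sample_data_path,
  pattern = "_edit_step7\\.csv$",
  recursive = TRUE,
  full.names = TRUE
)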

File Structure Validation

# Validate file structures
validation_results <- validate_csv_batch(sample_files)

# View validation summary
print(validation_results)
#> # A tibble: 5 × 9
#>   file_path    filename valid session_date row_count n_missing_cols n_extra_cols
#>   <chr>        <chr>    <lgl> <date>           <int>          <int>        <int>
#> 1 /home/runne… 2024-02… TRUE  2024-02-08           5              0            0
#> 2 /home/runne… 2024-05… TRUE  2024-05-14           5              0            0
#> 3 /home/runne… 2024-11… TRUE  2024-11-21           5              0            0
#> 4 /home/runne… 2025-02… TRUE  2025-02-06           5              0            0
#> 5 /home/runne… 2025-03… TRUE  2025-03-25           5              0            0
#> # ℹ 2 more variables: issues <chr>, error <chr>

# Check for issues
problems <- validation_results[!validation_results$valid, ]
if (nrow(problems) > 0) {
  cat("Files with issues:\n")
  print(problems[c("filename", "issues", "error")])
}

# Files ready for import
valid_files <- validation_results$file_path[validation_results$valid]
cat("\nValid files ready for import:", length(valid_files), "\n")
#> 
#> Valid files ready for import: 5

Step 3: Single File Import

Start by importing a single file to test the workflow:

# Import one file for testing
if (length(valid_files) > 0) {
  test_file <- valid_files[1]
  cat("Testing import with:", basename(test_file), "\n")
  
  success <- import_hansard_file(test_file, con, validate = TRUE)
  
  if (success) {
    cat("✓ Test import successful!\n")
    
    # Check what was imported
    stats <- get_database_stats(con)
    cat("Database now contains:\n")
    cat("  Sessions:", stats$sessions, "\n")
    cat("  Members:", stats$members, "\n")
    cat("  Speeches:", stats$speeches, "\n")
  }
}
#> Testing import with: 2024-02-08_edit_step7.csv
#> Processing: 2024-02-08_edit_step7.csv
#>   v Successfully imported 243 records
#> ✓ Test import successful!
#> Database now contains:
#>   Sessions: 1 
#>   Members: 44 
#>   Speeches: 243
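
You can also verify the import at the SQL level by querying the sessions table directly (the column names match those used in the SQL queries later in this vignette):

# Confirm the session row created by the test import
DBI::dbGetQuery(con, "SELECT session_id, session_date FROM sessions")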

Step 4: Batch Processing

Process One Year at a Time

import_hansard_year() loads a single year directory per call; here we call it for each sample year in turn:

# Process sample years
sample_year_dirs <- list.dirs(sample_data_path, recursive = FALSE, full.names = TRUE)
sample_year_dirs <- sample_year_dirs[grepl("\\d{4}$", basename(sample_year_dirs))]

for (year_dir in sample_year_dirs) {
  year_name <- basename(year_dir)
  cat("\n=== Processing", year_name, "===\n")
  
  # Show what files we're about to process
  csv_files <- list.files(year_dir, pattern = "_edit_step7\\.csv$", full.names = TRUE)
  cat("Files to process:\n")
  for (file in csv_files) {
    cat("  -", basename(file), "\n")
  }
  
  year_results <- import_hansard_year(
    year_dir, 
    con, 
    pattern = "*_edit_step7.csv",
    validate = TRUE,
    progress = FALSE  # Disable progress bar in vignette
  )
  
  # Show results
  if (nrow(year_results) > 0) {
    successful <- sum(year_results$success)
    total <- nrow(year_results)
    cat("Result: Imported", successful, "out of", total, "files successfully\n")
    
    if (successful < total) {
      failed_files <- year_results$filename[!year_results$success]
      cat("Failed files:", paste(failed_files, collapse = ", "), "\n")
    }
  }
}
#> 
#> === Processing 2024 ===
#> Files to process:
#>   - 2024-02-08_edit_step7.csv 
#>   - 2024-05-14_edit_step7.csv 
#>   - 2024-11-21_edit_step7.csv
#> Processing 2024 (3 files)
#> Processing: 2024-02-08_edit_step7.csv
#>   ! Session already exists (use force_reimport = TRUE to overwrite)
#> Processing: 2024-05-14_edit_step7.csv
#>   v Successfully imported 191 records
#> Processing: 2024-11-21_edit_step7.csv
#>   v Successfully imported 166 records
#> 
#> === Import Summary ===
#> Successful: 2/3 (66.7%)
#> Failed files: 2024-02-08_edit_step7.csv
#> Result: Imported 2 out of 3 files successfully
#> Failed files: 2024-02-08_edit_step7.csv 
#> 
#> === Processing 2025 ===
#> Files to process:
#>   - 2025-02-06_edit_step7.csv 
#>   - 2025-03-25_edit_step7.csv
#> Processing 2025 (2 files)
#> Processing: 2025-02-06_edit_step7.csv
#>   v Successfully imported 182 records
#> Processing: 2025-03-25_edit_step7.csv
#>   v Successfully imported 189 records
#> 
#> === Import Summary ===
#> Successful: 2/2 (100%)
#> Result: Imported 2 out of 2 files successfully

Process Multiple Years

Because force_reimport = FALSE skips sessions that already exist in the database, re-running the import over the years loaded above reports every file as skipped. The 0% success rates below are therefore expected, not errors.

# Process both available years in the sample data
available_years <- basename(list.dirs(sample_data_path, recursive = FALSE, full.names = FALSE))
available_years <- available_years[grepl("^\\d{4}$", available_years)]

cat("Processing all available sample years:", paste(available_years, collapse = ", "), "\n")
#> Processing all available sample years: 2024, 2025

all_results <- list()

for (year in available_years) {
  year_path <- file.path(sample_data_path, year)
  
  if (dir.exists(year_path)) {
    cat("\n=== Processing", year, "===\n")
    
    # Show what we're processing
    csv_files <- list.files(year_path, pattern = "_edit_step7\\.csv$")
    cat("Found", length(csv_files), "CSV files in", year, "\n")
    
    year_results <- import_hansard_year(
      year_path, 
      con,
      validate = TRUE,
      force_reimport = FALSE,  # Skip existing sessions
      progress = FALSE
    )
    
    all_results[[year]] <- year_results
    
    # Show summary
    if (nrow(year_results) > 0) {
      successful <- sum(year_results$success)
      total <- nrow(year_results)
      cat("Summary:", successful, "/", total, "files imported successfully\n")
    }
  } else {
    cat("Directory not found:", year_path, "\n")
  }
}
#> 
#> === Processing 2024 ===
#> Found 3 CSV files in 2024
#> Processing 2024 (3 files)
#> Processing: 2024-02-08_edit_step7.csv
#>   ! Session already exists (use force_reimport = TRUE to overwrite)
#> Processing: 2024-05-14_edit_step7.csv
#>   ! Session already exists (use force_reimport = TRUE to overwrite)
#> Processing: 2024-11-21_edit_step7.csv
#>   ! Session already exists (use force_reimport = TRUE to overwrite)
#> 
#> === Import Summary ===
#> Successful: 0/3 (0%)
#> Failed files: 2024-02-08_edit_step7.csv, 2024-05-14_edit_step7.csv, 2024-11-21_edit_step7.csv
#> Summary: 0 / 3 files imported successfully
#> 
#> === Processing 2025 ===
#> Found 2 CSV files in 2025
#> Processing 2025 (2 files)
#> Processing: 2025-02-06_edit_step7.csv
#>   ! Session already exists (use force_reimport = TRUE to overwrite)
#> Processing: 2025-03-25_edit_step7.csv
#>   ! Session already exists (use force_reimport = TRUE to overwrite)
#> 
#> === Import Summary ===
#> Successful: 0/2 (0%)
#> Failed files: 2025-02-06_edit_step7.csv, 2025-03-25_edit_step7.csv
#> Summary: 0 / 2 files imported successfully

# Combine results
if (length(all_results) > 0) {
  combined_results <- do.call(rbind, all_results)
  cat("\n=== Overall Import Summary ===\n")
  total_files <- nrow(combined_results)
  total_successful <- sum(combined_results$success)
  cat("Total files processed:", total_files, "\n")
  cat("Successfully imported:", total_successful, "\n")
  cat("Success rate:", round(100 * total_successful / total_files, 1), "%\n")
}
#> 
#> === Overall Import Summary ===
#> Total files processed: 5 
#> Successfully imported: 0 
#> Success rate: 0 %

Step 5: Data Exploration

Database Statistics

# Get comprehensive database statistics
stats <- get_database_stats(con)

cat("=== Database Statistics ===\n")
#> === Database Statistics ===
cat("Sessions:", stats$sessions, "\n")
#> Sessions: 5
cat("Members:", stats$members, "\n")
#> Members: 96
cat("Debates:", stats$debates, "\n")
#> Debates: 66
cat("Speeches:", stats$speeches, "\n")
#> Speeches: 971
cat("Date range:", as.character(stats$date_range[1]), "to", as.character(stats$date_range[2]), "\n")
#> Date range: 19761 to 20172
cat("Average speech length:", round(stats$avg_length, 1), "characters\n")
#> Average speech length: 2688.2 characters

cat("\nSpeech types:\n")
#> 
#> Speech types:
cat("  Questions:", stats$questions, "\n")
#>   Questions: NA
cat("  Answers:", stats$answers, "\n")
#>   Answers: NA
cat("  Speeches:", stats$speeches, "\n")
#>   Speeches: 971
cat("  Interjections:", stats$interjections, "\n")
#>   Interjections: NA
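
The date range prints as raw integers because R stores Date values as days since 1970-01-01 and SQLite has no native date type, so queries can return the underlying day counts. Given the values above, they convert back as follows:

# Convert epoch-day integers back to Date values
as.Date(stats$date_range, origin = "1970-01-01")
#> [1] "2024-02-08" "2025-03-25"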

Top Speakers Analysis

# Get most active speakers
top_speakers <- get_top_speakers(con, limit = 15)
print(top_speakers)
#> # A tibble: 15 × 8
#>    full_name       party electorate total_speeches total_words avg_speech_length
#>    <chr>           <chr> <chr>               <int>       <int>             <dbl>
#>  1 NA              NA    NA                    452     1126765             2509.
#>  2 Dick, Milton MP ALP   Oxley                 172       87631             6259.
#>  3 Albanese, Anth… ALP   Grayndler              32       45079             1610.
#>  4 DEPUTY SPEAKER… NA    NA                     21       80747            10093.
#>  5 Chalmers, Jim … ALP   Rankin                 18       33477             2092.
#>  6 Sukkar, Michae… LP    Deakin                 16          NA               NA 
#>  7 Dutton, Peter … LNP   Dickson                15        1893              473.
#>  8 Burke, Tony MP  ALP   Watson                 14       12470             2078.
#>  9 Claydon, Sharo… ALP   Newcastle              14         207              207 
#> 10 Taylor, Angus … LP    Hume                   14        2469              247.
#> 11 Plibersek, Tan… ALP   Sydney                  8       11193             1599 
#> 12 Marles, Richar… ALP   Corio                   7       12228             2038 
#> 13 Chesters, Lisa… ALP   Bendigo                 6         576              192 
#> 14 Butler, Mark C… ALP   Hindmarsh               5       10475             2095 
#> 15 Clare, Jason D… ALP   Blaxland                5       11696             2339.
#> # ℹ 2 more variables: questions_asked <lgl>, answers_given <lgl>

# Party breakdown
party_activity <- top_speakers |>
  group_by(party) |>
  summarise(
    members = n(),
    total_speeches = sum(total_speeches),
    avg_speeches_per_member = round(mean(total_speeches), 1),
    .groups = "drop"
  ) |>
  arrange(desc(total_speeches))

print(party_activity)
#> # A tibble: 4 × 4
#>   party members total_speeches avg_speeches_per_member
#>   <chr>   <int>          <int>                   <dbl>
#> 1 NA          2            473                     473
#> 2 ALP        10            281                     281
#> 3 LP          2             30                      30
#> 4 LNP         1             15                      15

Step 6: Advanced Queries with dplyr

Get table references for direct dplyr querying:

# Get table references
tbls <- get_hansard_tables(con)

# Show table structure
cat("Available tables:\n")
#> Available tables:
for (name in names(tbls)) {
  cat(" ", name, "\n")
}
#>   sessions 
#>   members 
#>   debates 
#>   speeches
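
These table objects are lazy references (the collect() calls below imply a dbplyr backend), so nothing is computed until you call collect(). You can inspect the SQL a pipeline will run without executing it:

# Show the SQL behind a dplyr pipeline; no data is transferred
tbls$speeches |>
  count(session_id) |>
  dplyr::show_query()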

Temporal Analysis

# Questions over time by party (using our sample data)
questions_by_party_year <- tbls$speeches |>
  filter(is_question == 1) |>
  left_join(tbls$members, by = "member_id") |>
  left_join(tbls$sessions, by = "session_id") |>
  count(party, year, sort = TRUE) |>
  collect()

if (nrow(questions_by_party_year) > 0) {
  cat("Questions by party and year in sample data:\n")
  print(questions_by_party_year)
} else {
  cat("No questions found in sample data\n")
}
#> No questions found in sample data

# Session-level activity (all speech types)
session_activity <- tbls$speeches |>
  left_join(tbls$sessions, by = "session_id") |>
  group_by(session_date, year) |>
  summarise(
    total_speeches = n(),
    questions = sum(is_question, na.rm = TRUE),
    answers = sum(is_answer, na.rm = TRUE),
    interjections = sum(is_interjection, na.rm = TRUE),
    .groups = "drop"
  ) |>
  arrange(session_date) |>
  collect()

cat("\nSession-level activity:\n")
#> 
#> Session-level activity:
print(session_activity)
#> # A tibble: 5 × 6
#>   session_date  year total_speeches questions answers interjections
#>          <int> <int>          <int> <lgl>     <lgl>   <lgl>        
#> 1        19761 -4658            243 NA        NA      NA           
#> 2        19857 -4658            191 NA        NA      NA           
#> 3        20048 -4658            166 NA        NA      NA           
#> 4        20125 -4658            182 NA        NA      NA           
#> 5        20172 -4657            189 NA        NA      NA

Content Analysis

# Average speech length by party (using sample data)
speech_length_by_party <- tbls$speeches |>
  left_join(tbls$members, by = "member_id") |>
  filter(!is.na(party)) |>  # Only include rows with party information
  group_by(party) |>
  summarise(
    speech_count = n(),
    avg_length = round(mean(content_length, na.rm = TRUE), 1),
    median_length = round(median(content_length, na.rm = TRUE), 1),
    total_words = sum(content_length, na.rm = TRUE),
    .groups = "drop"
  ) |>
  filter(speech_count >= 1) |>  # Include all parties in sample data
  arrange(desc(avg_length)) |>
  collect()

cat("Speech length statistics by party:\n")
#> Speech length statistics by party:
print(speech_length_by_party)
#> # A tibble: 8 × 5
#>   party speech_count avg_length median_length total_words
#>   <chr>        <int>      <dbl>         <dbl>       <int>
#> 1 NATS             9     37116.        37116.       74231
#> 2 AG              12     10751.         5355        64508
#> 3 IND             15      7685.          563        99908
#> 4 ALP            369      2218.         1167       379359
#> 5 KAP              2       880           880          880
#> 6 LP              58       726.          332        18868
#> 7 LNP             31       351.          326         4217
#> 8 CA               2        NA            NA           NA

# Most active debates in sample data
popular_debates <- tbls$speeches |>
  left_join(tbls$debates, by = "debate_id") |>
  filter(!is.na(debate_title)) |>
  count(debate_title, sort = TRUE) |>
  collect()

cat("\nMost discussed topics in sample data:\n")
#> 
#> Most discussed topics in sample data:
print(popular_debates)
#> # A tibble: 25 × 2
#>    debate_title                     n
#>    <chr>                        <int>
#>  1 QUESTIONS WITHOUT NOTICE       528
#>  2 BILLS                          122
#>  3 STATEMENTS BY MEMBERS          106
#>  4 MATTERS OF PUBLIC IMPORTANCE    43
#>  5 ADJOURNMENT                     37
#>  6 CONSTITUENCY STATEMENTS         21
#>  7 CONDOLENCES                     21
#>  8 MOTIONS                         13
#>  9 COMMITTEES                      12
#> 10 BUSINESS                        12
#> # ℹ 15 more rows

# Speech type distribution
speech_types <- tbls$speeches |>
  summarise(
    total_records = n(),
    questions = sum(is_question, na.rm = TRUE),
    answers = sum(is_answer, na.rm = TRUE),
    speeches = sum(is_speech, na.rm = TRUE),
    interjections = sum(is_interjection, na.rm = TRUE),
    stage_directions = sum(is_stage_direction, na.rm = TRUE)
  ) |>
  collect()

cat("\nSpeech type distribution in sample data:\n")
#> 
#> Speech type distribution in sample data:
print(speech_types)
#> # A tibble: 1 × 6
#>   total_records questions answers speeches interjections stage_directions
#>           <int> <lgl>     <lgl>   <lgl>    <lgl>         <lgl>           
#> 1           971 NA        NA      NA       NA            NA
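
The NA counts suggest that the is_question, is_answer, and related flag columns are unpopulated in this sample. A standard SQLite pragma shows how each column of the speeches table is declared:

# Inspect column names and declared types
DBI::dbGetQuery(con, "PRAGMA table_info(speeches)")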

Member-Specific Analysis

# Most active members in sample data
sample_speakers <- tbls$speeches |>
  left_join(tbls$members, by = "member_id") |>
  left_join(tbls$sessions, by = "session_id") |>
  filter(!is.na(full_name)) |>  # Only include identified speakers
  group_by(full_name, party, electorate) |>
  summarise(
    sessions_active = n_distinct(session_id),
    total_contributions = n(),
    questions = sum(is_question, na.rm = TRUE),
    answers = sum(is_answer, na.rm = TRUE),
    speeches = sum(is_speech, na.rm = TRUE),
    avg_speech_length = round(mean(content_length, na.rm = TRUE), 1),
    .groups = "drop"
  ) |>
  arrange(desc(total_contributions)) |>
  collect()

cat("Member activity in sample data:\n")
#> Member activity in sample data:
print(sample_speakers)
#> # A tibble: 96 × 9
#>    full_name      party electorate sessions_active total_contributions questions
#>    <chr>          <chr> <chr>                <int>               <int> <lgl>    
#>  1 Dick, Milton … ALP   Oxley                    5                 172 NA       
#>  2 Albanese, Ant… ALP   Grayndler                4                  32 NA       
#>  3 DEPUTY SPEAKE… NA    NA                       3                  21 NA       
#>  4 Chalmers, Jim… ALP   Rankin                   3                  18 NA       
#>  5 Sukkar, Micha… LP    Deakin                   4                  16 NA       
#>  6 Dutton, Peter… LNP   Dickson                  4                  15 NA       
#>  7 Burke, Tony MP ALP   Watson                   5                  14 NA       
#>  8 Claydon, Shar… ALP   Newcastle                5                  14 NA       
#>  9 Taylor, Angus… LP    Hume                     3                  14 NA       
#> 10 Plibersek, Ta… ALP   Sydney                   3                   8 NA       
#> # ℹ 86 more rows
#> # ℹ 3 more variables: answers <lgl>, speeches <lgl>, avg_speech_length <dbl>

# Party representation in sample
party_summary <- tbls$members |>
  filter(!is.na(party)) |>
  count(party, sort = TRUE) |>
  collect()

cat("\nParty representation in sample data:\n")
#> 
#> Party representation in sample data:
print(party_summary)
#> # A tibble: 8 × 2
#>   party     n
#>   <chr> <int>
#> 1 ALP      54
#> 2 LP       13
#> 3 IND       9
#> 4 LNP       9
#> 5 NATS      5
#> 6 AG        3
#> 7 CA        1
#> 8 KAP       1

Step 7: Text Search and Analysis

# Search for specific topics using database-friendly approach
# Note: grepl() has no SQLite translation, so we search with SQL LIKE instead
climate_mentions <- DBI::dbGetQuery(con, "
  SELECT 
    m.full_name,
    m.party, 
    s.session_date,
    sp.content_length,
    SUBSTR(sp.content, 1, 100) || '...' as content_preview
  FROM speeches sp
  LEFT JOIN members m ON sp.member_id = m.member_id  
  LEFT JOIN sessions s ON sp.session_id = s.session_id
  WHERE LOWER(sp.content) LIKE '%climate%'
  ORDER BY s.session_date DESC
  LIMIT 10
")

if (nrow(climate_mentions) > 0) {
  cat("Recent 'climate' mentions:\n")
  print(climate_mentions)
} else {
  cat("No 'climate' mentions found in current dataset\n")
  
  # Try a broader search
  any_mentions <- DBI::dbGetQuery(con, "
    SELECT COUNT(*) as total_speeches
    FROM speeches 
    WHERE content IS NOT NULL AND LENGTH(content) > 10
  ")
  cat("Total speeches with content:", any_mentions$total_speeches, "\n")
}
#> Recent 'climate' mentions:
#>                   full_name party session_date content_length
#> 1                      <NA>  <NA>        20172           1625
#> 2                      <NA>  <NA>        20172           6516
#> 3                      <NA>  <NA>        20172          12979
#> 4                      <NA>  <NA>        20172           1482
#> 5       Bandt, Adam Paul MP    AG        20172            523
#> 6      Albanese, Anthony MP   ALP        20172           2308
#> 7           Sitou, Sally MP   ALP        20172            216
#> 8  Plibersek, Tanya Joan MP   ALP        20172           1722
#> 9                      <NA>  <NA>        20172           4690
#> 10          Wilson, Rick MP    LP        20172           6602
#>                                                                                            content_preview
#> 1  by leave—I move:  That standing order 133 (b) (deferred divisions) be suspended for this sitting. I ...
#> 2  Honourable members interjecting—  Just hold your horses. Members on my right are not helping this si...
#> 3  I move:  That this bill be now read a second time. I'd like to add that Labor is committed to fixing...
#> 4  Right now, this country is run in the interests of billionaires and big corporations. While single p...
#> 5  My question is to the Prime Minister. Your government has approved over 30 new coal and gas projects...
#> 6  What absolute nonsense from the member for Melbourne, the leader of a political party that stopped c...
#> 7  My question is to the Minister for Climate Change and Energy. How has the Albanese Labor government ...
#> 8  I thank the member for her question. The number of coal mines or extensions approved is 10. Four hav...
#> 9  Mr Hamilton interjecting—  The DEPUTY SPEAKER: Member for Groom, do you want to join in the Speaker'...
#> 10 I move, as an amendment to the amendment moved by the member for Fairfax:  That all words after 'whi...

# Example of other search terms you might use
search_terms <- c("economy", "health", "education", "budget")
for (term in search_terms) {
  count_query <- paste0("
    SELECT COUNT(*) as count 
    FROM speeches 
    WHERE LOWER(content) LIKE '%", tolower(term), "%'
  ")
  
  result <- DBI::dbGetQuery(con, count_query)
  if (result$count > 0) {
    cat("Found", result$count, "mentions of '", term, "'\n")
  }
}
#> Found 61 mentions of ' economy '
#> Found 106 mentions of ' health '
#> Found 74 mentions of ' education '
#> Found 102 mentions of ' budget '
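
Pasting a search term into the SQL string works for fixed terms, but a parameterised query is safer when the term comes from user input. This sketch uses DBI's params argument, which DBI and RSQLite support:

# Parameterised version of the same count, immune to quoting problems
term <- "economy"
DBI::dbGetQuery(
  con,
  "SELECT COUNT(*) AS count FROM speeches WHERE LOWER(content) LIKE ?",
  params = list(paste0("%", tolower(term), "%"))
)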

Alternative Database-Friendly Search Pattern

SQLite's LIKE operator is case-insensitive for ASCII text by default, so the LOWER() call above is not strictly necessary. Keeping the LIMIT in SQL and selecting only the columns you need also minimises the data transferred to R:

# More efficient search for large databases
# Search with LIMIT in SQL rather than collecting all results
climate_search <- DBI::dbGetQuery(con, "
  SELECT 
    m.full_name,
    m.party, 
    s.session_date,
    sp.content_length
  FROM speeches sp
  LEFT JOIN members m ON sp.member_id = m.member_id  
  LEFT JOIN sessions s ON sp.session_id = s.session_id
  WHERE sp.content LIKE '%climate%'
  ORDER BY s.session_date DESC
  LIMIT 10
")

if (nrow(climate_search) > 0) {
  cat("Climate mentions using direct SQL:\n")
  print(climate_search)
} else {
  cat("No climate mentions found\n")
}
#> Climate mentions using direct SQL:
#>                   full_name party session_date content_length
#> 1                      <NA>  <NA>        20172           1625
#> 2                      <NA>  <NA>        20172           6516
#> 3                      <NA>  <NA>        20172          12979
#> 4                      <NA>  <NA>        20172           1482
#> 5       Bandt, Adam Paul MP    AG        20172            523
#> 6      Albanese, Anthony MP   ALP        20172           2308
#> 7           Sitou, Sally MP   ALP        20172            216
#> 8  Plibersek, Tanya Joan MP   ALP        20172           1722
#> 9                      <NA>  <NA>        20172           4690
#> 10          Wilson, Rick MP    LP        20172           6602
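
The schema is described above as "full-text search ready". For repeated keyword searches over a large corpus, SQLite's FTS5 extension (included in RSQLite's bundled SQLite) is much faster than LIKE scans. A minimal sketch, assuming the speeches table has an integer speech_id primary key:

# Build an external-content FTS5 index over speech text (one-off setup)
DBI::dbExecute(con, "
  CREATE VIRTUAL TABLE IF NOT EXISTS speeches_fts
  USING fts5(content, content = 'speeches', content_rowid = 'speech_id')
")
DBI::dbExecute(con, "
  INSERT INTO speeches_fts(rowid, content)
  SELECT speech_id, content FROM speeches
")

# Ranked keyword search against the index
DBI::dbGetQuery(con, "
  SELECT rowid AS speech_id, rank
  FROM speeches_fts
  WHERE speeches_fts MATCH 'climate'
  ORDER BY rank
  LIMIT 10
")

On a live database you would also add triggers so the index stays in sync as speeches are inserted or updated.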

Question and Answer Balance by Party

# Questions vs Answers by party
qa_balance <- tbls$speeches |>
  left_join(tbls$members, by = "member_id") |>
  group_by(party) |>
  summarise(
    questions = sum(is_question, na.rm = TRUE),
    answers = sum(is_answer, na.rm = TRUE),
    question_answer_ratio = round(
      sum(is_question, na.rm = TRUE) / pmax(sum(is_answer, na.rm = TRUE), 1), 2
    ),
    .groups = "drop"
  ) |>
  filter(questions + answers >= 10) |>
  arrange(desc(question_answer_ratio)) |>
  collect()
#> Warning: Missing values are always removed in SQL aggregation functions.
#> Use `na.rm = TRUE` to silence this warning
#> This warning is displayed once every 8 hours.

print(qa_balance)
#> # A tibble: 0 × 4
#> # ℹ 4 variables: party <chr>, questions <lgl>, answers <lgl>,
#> #   question_answer_ratio <lgl>

Step 8: Data Export for External Analysis

Export Summary Tables to CSV

# Export member statistics
member_stats <- get_top_speakers(con, limit = 100)
write.csv(member_stats, file.path(tempdir(), "member_activity_stats.csv"), row.names = FALSE)

# Export session summary
session_summary <- tbls$sessions |>
  left_join(
    tbls$speeches |> 
      group_by(session_id) |> 
      summarise(
        speech_count = n(),
        question_count = sum(is_question, na.rm = TRUE),
        avg_speech_length = round(mean(content_length, na.rm = TRUE), 1),
        .groups = "drop"
      ),
    by = "session_id"
  ) |>
  collect()

write.csv(session_summary, file.path(tempdir(), "session_summary.csv"), row.names = FALSE)

cat("Data exported to CSV files for external analysis\n")
#> Data exported to CSV files for external analysis

Step 9: Database Maintenance

Check Import Status

Sessions skipped during the re-run above were recorded as unsuccessful, so the retry below (with force_reimport = TRUE) re-imports all five files.

# Check for any failed imports
if (exists("combined_results")) {
  failed_imports <- combined_results[!combined_results$success, ]
  
  if (nrow(failed_imports) > 0) {
    cat("Files that failed to import:\n")
    print(failed_imports[c("filename", "processed_at")])
    
    # Retry failed imports
    cat("\nRetrying failed imports...\n")
    retry_results <- import_hansard_batch(
      failed_imports$file_path, 
      con, 
      validate = TRUE,
      force_reimport = TRUE
    )
    print(retry_results)
  } else {
    cat("All files imported successfully!\n")
  }
}
#> Files that failed to import:
#> # A tibble: 5 × 2
#>   filename                  processed_at       
#>   <chr>                     <dttm>             
#> 1 2024-02-08_edit_step7.csv 2025-08-06 03:18:28
#> 2 2024-05-14_edit_step7.csv 2025-08-06 03:18:28
#> 3 2024-11-21_edit_step7.csv 2025-08-06 03:18:28
#> 4 2025-02-06_edit_step7.csv 2025-08-06 03:18:28
#> 5 2025-03-25_edit_step7.csv 2025-08-06 03:18:28
#> 
#> Retrying failed imports...
#> Processing: 2024-02-08_edit_step7.csv
#>   v Successfully imported 243 records
#> Processing: 2024-05-14_edit_step7.csv
#>   v Successfully imported 191 records
#> Processing: 2024-11-21_edit_step7.csv
#>   v Successfully imported 166 records
#> Processing: 2025-02-06_edit_step7.csv
#>   v Successfully imported 182 records
#> Processing: 2025-03-25_edit_step7.csv
#>   v Successfully imported 189 records
#> 
#> === Import Summary ===
#> Successful: 5/5 (100%)
#> # A tibble: 5 × 4
#>   file_path                                 filename success processed_at       
#>   <chr>                                     <chr>    <lgl>   <dttm>             
#> 1 /home/runner/work/_temp/Library/hansardR… 2024-02… TRUE    2025-08-06 03:18:31
#> 2 /home/runner/work/_temp/Library/hansardR… 2024-05… TRUE    2025-08-06 03:18:31
#> 3 /home/runner/work/_temp/Library/hansardR… 2024-11… TRUE    2025-08-06 03:18:31
#> 4 /home/runner/work/_temp/Library/hansardR… 2025-02… TRUE    2025-08-06 03:18:31
#> 5 /home/runner/work/_temp/Library/hansardR… 2025-03… TRUE    2025-08-06 03:18:32

Database Optimisation

# Get database file size
if (file.exists(db_path)) {
  db_size_mb <- round(file.size(db_path) / 1024^2, 1)
  cat("Database size:", db_size_mb, "MB\n")
}
#> Database size: 4.2 MB

# Optimize database (SQLite maintenance)
DBI::dbExecute(con, "VACUUM;")
#> [1] 0
DBI::dbExecute(con, "ANALYZE;")
#> [1] 0

cat("Database optimized\n")
#> Database optimized
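
Two other standard SQLite checks can be run the same way:

# Verify on-disk consistency (returns "ok" for a healthy database)
DBI::dbGetQuery(con, "PRAGMA integrity_check")

# Page statistics behind the file size reported above
DBI::dbGetQuery(con, "PRAGMA page_count")
DBI::dbGetQuery(con, "PRAGMA page_size")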

Cleanup

# Close database connection when finished
dbDisconnect(con)
cat("Database connection closed\n")
#> Database connection closed

Next Steps

With your Hansard data now in a structured database, you can:

  1. Conduct longitudinal analysis of parliamentary discourse
  2. Perform text mining and sentiment analysis on speech content
  3. Analyse party dynamics and question-answer patterns
  4. Study member behaviour across different topics and time periods
  5. Export data for use with other text analysis tools

Troubleshooting

Common Issues

File validation failures:

  - Check file encoding (should be UTF-8)
  - Verify column names match the expected schema
  - Ensure the date in the filename is formatted YYYY-MM-DD

Import errors:

  - Use force_reimport = TRUE to overwrite existing sessions
  - Check database permissions and disk space
  - Validate file structure before import

Memory issues with large datasets:

  - Process years individually rather than all at once
  - Use progress = FALSE to reduce overhead
  - Close and reconnect the database periodically during long imports

Query performance:

  - The database includes optimised indexes for common queries
  - Use collect() only when you need the data in R
  - Filter early in dplyr chains to reduce data transfer

For additional help, see the package documentation: help(package = "hansardR")