§1 Introduction to Digital Humanities

1 Slides

2 Digital Humanities Analysis with R

Let’s explore some fascinating examples of how computational methods can be applied to humanities questions.

2.1 Example 1: Text Analysis of Literary Works

Code

library(pacman)
p_load(tidyverse, tidytext, wordcloud, gutenbergr, scales, hrbrthemes)

# Download the full text of "The Age of Innocence" from Project Gutenberg.
# The numeric ID can be looked up with:
#   gutenberg_works(title == "The Age of Innocence")
inno <- gutenberg_download(541)

# Word frequencies: split the text into one word per row, drop common stop
# words, count occurrences, and keep the 20 most frequent words (ties at the
# cut-off are kept, so more than 20 rows are possible).
word_freq <- inno %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words, by = "word") %>%  # explicit key: no "Joining with" message
  count(word, sort = TRUE) %>%
  slice_max(n, n = 20)                    # replaces the superseded top_n(20, n)

# Horizontal bar chart of the word frequencies, most frequent word on top
p <- ggplot(word_freq, aes(x = reorder(word, n), y = n)) +
  geom_col(fill = "#69b3a2", width = 0.7) +
  # Print each count just past the end of its bar
  geom_text(aes(label = n), hjust = -0.3, size = 3) +
  coord_flip() +
  # No gap at the base of the bars; 10% headroom so the count labels fit
  scale_y_continuous(expand = expansion(mult = c(0, 0.1))) +
  labs(x = NULL, y = "Frequency", 
       title = "Top 20 Most Frequent Words in The Age of Innocence",
       subtitle = "After removing common stop words",
       caption = "Source: Project Gutenberg") +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 16, face = "bold", margin = margin(b = 10)),
    plot.subtitle = element_text(size = 12, color = "gray50", margin = margin(b = 20)),
    plot.caption = element_text(size = 10, color = "gray50", margin = margin(t = 10)),
    axis.text = element_text(size = 10),
    axis.title = element_text(size = 12),
    # After coord_flip() the words sit on the vertical axis; grid lines
    # between them add nothing
    panel.grid.major.y = element_blank(),
    panel.grid.minor = element_blank(),
    plot.margin = margin(20, 20, 20, 20)
  )

# Print the plot
print(p)

This bar chart shows the 20 most frequent words in The Age of Innocence (after removing common stop words), giving us a quick insight into common themes and vocabulary.

Discussion

What words stand out to you in this visualization?
How might this kind of analysis complement traditional close reading of literary works?
What limitations might this approach have for understanding the author’s language?

2.2 Example 2: Sentiment Analysis of Jane Austen’s Novels

Code

library(janeaustenr)
library(tidyverse)
library(tidytext)
library(hrbrthemes)

# Combine Austen's novels into one table and, within each book, record the
# line number and a running chapter counter (incremented on every line that
# starts with "chapter" followed by a digit or roman numeral)
austen_books <- austen_books() %>%
  group_by(book) %>%
  mutate(
    linenumber = row_number(),
    chapter = cumsum(str_detect(text, regex("^chapter [\\divxlc]", ignore_case = TRUE)))
  ) %>%
  ungroup()

# Perform sentiment analysis:
# split into words, keep only words found in the Bing lexicon, count positive
# and negative words per 100-line section of each book, and take the
# difference as the net sentiment of that section.
austen_sentiment <- austen_books %>%
  unnest_tokens(word, text) %>%
  # A few lexicon words carry both a positive and a negative label, so a word
  # that occurs many times in the text can match several lexicon rows.
  # Declaring the relationship keeps all matches (as before) without dplyr's
  # many-to-many warning; the explicit key avoids the "Joining with" message.
  inner_join(
    get_sentiments("bing"),
    by = "word",
    relationship = "many-to-many"
  ) %>%
  count(book, index = linenumber %/% 100, sentiment) %>%
  # One column per sentiment; sections lacking a sentiment get a count of 0
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>% 
  mutate(sentiment = positive - negative)

# Plot sentiment over narrative time, one panel per novel
ggplot(austen_sentiment, aes(index, sentiment, fill = book)) +
  geom_col(show.legend = FALSE) +
  # Each panel gets its own x range (scales = "free_x")
  facet_wrap(~book, ncol = 2, scales = "free_x") +
  labs(
    title = "Sentiment Analysis of Jane Austen's Novels",
    subtitle = "Emotional trajectory throughout the narrative",
    x = "Narrative Time",
    y = "Sentiment Score",
    caption = "Data: janeaustenr package | Analysis: tidytext"
  ) +
  theme_minimal() +
  theme(
    text = element_text(size = 12),
    strip.text = element_text(size = 14, face = "bold"),
    panel.grid.minor = element_blank(),
    plot.title = element_text(size = 18, face = "bold"),
    plot.subtitle = element_text(size = 14, color = "gray50"),
    plot.caption = element_text(size = 10, color = "gray50"),
    axis.title = element_text(size = 12, face = "bold"),
    plot.margin = margin(20, 20, 20, 20)
  ) +
  scale_fill_brewer(palette = "Set2") +
  scale_y_continuous(expand = expansion(mult = c(0.1, 0.1)))

Code

# Summary statistics per novel: average, highest and lowest section score
sentiment_by_book <- group_by(austen_sentiment, book)

austen_summary <- summarize(
  sentiment_by_book,
  mean_sentiment = mean(sentiment),
  max_sentiment = max(sentiment),
  min_sentiment = min(sentiment)
)

# Render the summary as a table, rounded to two decimals
knitr::kable(
  austen_summary,
  caption = "Summary Statistics of Sentiment Analysis",
  digits = 2
)

Summary Statistics of Sentiment Analysis
book	mean_sentiment	max_sentiment	min_sentiment
Sense & Sensibility	9.94	51	-26
Pride & Prejudice	10.69	42	-31
Mansfield Park	12.47	61	-45
Emma	14.40	48	-31
Northanger Abbey	9.19	47	-33
Persuasion	15.14	59	-13

This visualization shows the emotional trajectory of Jane Austen’s novels over their narrative time.

Discussion: Narrative Emotions

What patterns do you notice in the emotional arcs of Austen’s novels?
How might this type of analysis enhance our understanding of narrative structure?
What challenges might arise in applying sentiment analysis to historical texts?

2.3 Example 3: Network Analysis of “Empresses in the Palace” Characters

Code

library(pacman)
p_load(jsonlite,igraph,ggraph,tidyverse,showtext,ggrepel,cowplot)

# Register the "Noto Sans SC" Google font and let showtext handle text
# rendering, so the Chinese labels in the plot can be drawn
font_add_google("Noto Sans SC", "Noto Sans SC")
showtext_auto()


# Load the character (nodes) and relationship (edges) data
relation_data <- fromJSON("../book/data/relation.json")

# Clean the node table: rename the description column to "Bio", fold the
# variant alliance label "皇后阵容" into "皇后阵营", and store IDs as character
relation_data$nodes <- relation_data$nodes %>%
  rename(Bio = "角色描述") %>%
  mutate(
    Alliance = ifelse(Alliance == "皇后阵容", "皇后阵营", Alliance),
    ID = as.character(ID)
  )

# Edge endpoints are stored as character too, so they match the node IDs
relation_data$edges <- relation_data$edges %>%
  mutate(
    source = as.character(source),
    target = as.character(target)
  )

# Use the ID as the row names of the node table
rownames(relation_data$nodes) <- relation_data$nodes$ID

# Edge list without the relationship label (re-attached as an edge attribute
# once the graph exists)
edges <- select(relation_data$edges, -Relationship)

# Build the directed graph; vertices are supplied in node-table order
empresses_graph <- graph_from_data_frame(
  edges,
  directed = TRUE,
  vertices = relation_data$nodes$ID
)

# Vertex attributes copied from the node table
V(empresses_graph)$Alliance <- relation_data$nodes$Alliance
V(empresses_graph)$Label <- relation_data$nodes$Label

# Node size grows with the total number of connections (in + out)
V(empresses_graph)$size <- 3 + 0.5 * degree(empresses_graph, mode = "total")

# Edge attribute: the type of each relationship
E(empresses_graph)$Relationship <- relation_data$edges$Relationship

# One colour per alliance
alliance_colors <- c(
  "皇室成员" = "#4E79A7",  # Royal Blue
  "皇后阵营" = "#F28E2B",  # Warm Orange
  "甄嬛阵营" = "#E15759",  # Soft Red
  "华妃阵营" = "#76B7B2"   # Teal
)

# Closed arrow head drawn at the target end of every edge
edge_arrow <- arrow(
  length = unit(0.2, "inches"),
  ends = "last",
  type = "closed"
)

# Draw the network; the seed is fixed just before the layout is computed
# so the figure is reproducible
set.seed(123)
plot <- ggraph(empresses_graph, layout = "fr") +
  geom_edge_link0(
    aes(edge_color = Relationship),
    arrow = edge_arrow,
    show.legend = FALSE,
    width = 1
  ) +
  geom_node_point(
    aes(color = Alliance, size = size),
    shape = 20,
    show.legend = FALSE
  ) +
  geom_node_text(aes(label = Label), repel = TRUE, size = 6) +
  scale_color_manual(values = alliance_colors) +
  scale_edge_colour_discrete() +
  scale_size_continuous(range = c(5, 20)) +
  theme_void() +
  labs(
    title = "Character Network in 'Empresses in the Palace'",
    subtitle = "Directed relationships between key figures in the Chinese drama"
  ) +
  theme(
    plot.title = element_text(hjust = 0.5, size = 18, face = "bold"),
    plot.subtitle = element_text(hjust = 0.5, size = 14),
    legend.text = element_text(size = 8),
    legend.title = element_text(size = 10, face = "bold")
  )

# Show the network
plot

This visualization shows the complex network of relationships between characters in the Chinese drama “Empresses in the Palace” (甄嬛传).

Discussion: Character Networks in Historical Dramas

What insights can we gain from this network visualization? What information is presented in the plot?
Which characters appear to be central to the network? How might their positions reflect their importance in the narrative?
How could this type of analysis complement traditional literary analysis of historical dramas or novels?
What limitations might this network analysis have in representing the complex relationships and dynamics of the story?

Let’s verify our impressions!

Code

library(pacman)
p_load(jsonlite,igraph,ggraph,tidyverse,showtext,ggrepel,cowplot)
# Build stand-alone legends for the network plot (its layers were drawn with
# show.legend = FALSE) and place them next to it.

# Distinct alliance and relationship labels present in the graph
alliance_levels <- levels(factor(V(empresses_graph)$Alliance))
relationship_levels <- levels(factor(E(empresses_graph)$Relationship))

# Dummy data for the legends. tibble() recycles the constant x even when there
# are no levels, and seq_along() avoids the 1:0 trap of 1:length(...).
alliance_data <- tibble(
  Alliance = alliance_levels,
  x = 1,
  y = seq_along(alliance_levels)
)

# Size legend: pick a few round connection counts inside the observed range
# and convert them to node sizes with the same formula used for
# V(empresses_graph)$size (degree * 0.5 + 3), so the legend keys match the
# node sizes drawn in the network.
node_degrees <- degree(empresses_graph, mode = "total")
degree_breaks <- pretty(node_degrees, n = 4)
degree_breaks <- degree_breaks[
  degree_breaks >= min(node_degrees) & degree_breaks <= max(node_degrees)
]

size_data <- tibble(
  connections = degree_breaks,
  size = degree_breaks * 0.5 + 3,
  x = 1,
  y = seq_along(degree_breaks)
)

relationship_data <- tibble(
  Relationship = relationship_levels,
  x = 1,
  y = seq_along(relationship_levels)
)

# Create separate legends
node_legend <- cowplot::get_legend(
  ggplot() +
    geom_point(data = alliance_data, aes(x, y, color = Alliance), size = 5) +
    geom_point(data = size_data, aes(x, y, size = size), color = "black") +
    scale_color_manual(values = alliance_colors, name = "Alliance") +
    # Same output range and input limits as the size scale of the network
    # plot; the keys are labelled with the connection counts they stand for
    scale_size_continuous(
      range = c(5, 20),
      limits = range(V(empresses_graph)$size),
      breaks = size_data$size,
      labels = as.character(size_data$connections),
      name = "Connections"
    ) +
    guides(
      color = guide_legend(override.aes = list(size = 5)),
      size = guide_legend(override.aes = list(color = "steelblue"))
    ) +
    theme(
      legend.background = element_blank(),
      legend.key = element_blank(),
      legend.spacing.y = unit(0.5, "cm")
    )
)

edge_legend <- cowplot::get_legend(
  ggplot(relationship_data, aes(x, y, color = Relationship)) +
    geom_segment(aes(x = 0, xend = 1, yend = y),
                 arrow = arrow(length = unit(0.2, "inches"), type = "closed")) +
    scale_color_discrete(name = "Relationship") +
    theme(
      legend.background = element_blank(),
      legend.key = element_blank(),
      legend.spacing.y = unit(0.2, "cm"),
      legend.text = element_text(margin = margin(r = 15))
    )
)

# Stack the two legends vertically
combined_legend <- plot_grid(
  node_legend, 
  edge_legend, 
  ncol = 1, 
  rel_heights = c(1, 1.5),
  align = 'v',
  axis = 'l'
)

# Combine main plot and legends side by side
final_plot <- plot_grid(
  plot, 
  combined_legend,
  rel_widths = c(5, 2),
  align = 'h',
  axis = 'tb'
)

# Display the final plot
final_plot

Code

library(pacman)
p_load(jsonlite,igraph,ggraph,tidyverse,showtext,ggrepel,cowplot)

# Number of characters in each alliance, largest alliance first
alliance_counts <- table(V(empresses_graph)$Alliance)

character_summary <- arrange(
  tibble(
    Alliance = names(alliance_counts),
    Count = as.numeric(alliance_counts)
  ),
  desc(Count)
)

# Render the counts as a table
knitr::kable(
  character_summary,
  caption = "Summary of Character Alliances"
)

Summary of Character Alliances
Alliance	Count
甄嬛阵营	22
皇后阵营	12
华妃阵营	6
皇室成员	4

Code

# Degree centrality of every character: incoming, outgoing and combined
degree_table <- tibble(
  Character = V(empresses_graph)$Label,
  InDegree = degree(empresses_graph, mode = "in"),
  OutDegree = degree(empresses_graph, mode = "out"),
  TotalDegree = degree(empresses_graph, mode = "total")
)

# Descriptive columns to attach to each character, keyed by label
character_info <- relation_data$nodes %>%
  select(Label, Title, Alliance, Bio)

# Rank by total connections, add the descriptive columns, keep the first five
top_characters <- degree_table %>%
  arrange(desc(TotalDegree)) %>%
  left_join(character_info, by = c("Character" = "Label")) %>%
  slice_head(n = 5)

# Render the ranking as a table
knitr::kable(
  top_characters,
  caption = "Top 5 Characters by Total Connections"
)

Top 5 Characters by Total Connections
Character	InDegree	OutDegree	TotalDegree	Title	Alliance	Bio
甄嬛	16	22	38	皇贵妃	甄嬛阵营	因容貌酷似已逝的纯元皇后被选中，一直蒙受恩宠。对皇帝真心实意。因为在后宫算计中受到影响，一步步变得城府至深，结果却是她的一厢情愿，离开后宫，遇到了果郡王，心心相爱。后来设计回宫。经历了后宫无数的明争暗斗，勾心斗角，害死皇帝，扶养子登位，终于成为太后。
雍正	13	18	31	皇上	皇室成员	九龙夺嫡之后终于登上了宝座。在他的眼中任何人都可以成为他巩固江山的棋子。纯元皇后去世多年，痴情的皇帝对她依然念念不忘，于是对酷似纯元皇后的甄嬛愈加宠爱，渐渐超过了华妃。甄嬛回宫，皇上对其极其信任，百依百顺。但最后还是被枕边人算计。
宜修	8	11	19	皇后	皇后阵营	纯元皇后的庶出妹妹。外表稳重端庄，但擅于隐忍，城府之深，极爱后位，后宫妃子皆为她的棋子。利用甄嬛打败了华妃，继而将甄嬛赶出宫外。甄嬛回宫后继续步步为营，但最后因害死纯元皇后的事情东窗事发，在太后的力保之下没被废后，但永生禁足于景仁宫，二人死生不复相见，最后新帝登基，心悸而死。
允礼	7	7	14	果亲王	皇室成员	果郡王不同于宫中争权夺势的皇室子弟，他庆幸自己并非帝王之身，不必将朝政琐事萦绕于身，他不求娇妻美妾如云，只盼与爱人厮守到老。果郡王爱着甄嬛，却又不能僭越宫廷之礼。他善良、痴情、风流倜傥，终其一生都是在寻求一份爱，奋不顾身地保护一份爱，说他是为爱而生也不为过。
年世兰	6	7	13	正三品贵嫔	华妃阵营	大将军的妹妹，仗着身世和宠爱，对其他的妃嫔不择手段，但本性并不坏，因为孩子被打掉而痛恨端妃。甄嬛在其打入冷宫后还不能怀孕的事实告诉她，绝望的华妃撞墙而死。最终她被追封为敦肃皇贵妃。

3 Conclusion

These demonstrations showcase just a few of the exciting possibilities that digital humanities offers for exploring and analyzing humanities data. By combining computational methods with traditional humanities scholarship, we can uncover new patterns, ask novel questions, and gain fresh insights into cultural and historical materials.

Final Reflection

Consider the examples we’ve explored today:

Which technique (word frequency analysis, sentiment analysis, or network analysis) do you find most intriguing? Why?
Can you think of a humanities question or topic from your own interests that might benefit from one of these computational approaches?

Share your thoughts with a partner or the class if time allows.