Notes V.2.0.0

Rewrote Notes
2026-01-07 13:51:33 +01:00
parent c1878069fd
commit bcd2ddfe42
13 changed files with 787 additions and 623 deletions

Binary file not shown.

View File

@@ -3,6 +3,7 @@
\def\papertitle{Information Systems for Engineers} \def\papertitle{Information Systems for Engineers}
\def\theorytitle{Theory} \def\theorytitle{Theory}
\def\notetitle{Notes:-}
\def\corollarytitle{Corollary} \def\corollarytitle{Corollary}
\def\proposaltitle{Proposal} \def\proposaltitle{Proposal}
\def\claimtitle{Claim} \def\claimtitle{Claim}

View File

@@ -2,6 +2,7 @@
% PACKAGE IMPORTS % PACKAGE IMPORTS
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% TODO: Change Babel Language if needed
% \usepackage[ngerman]{babel} % \usepackage[ngerman]{babel}
\usepackage[macchiato, styleAll]{catppuccinpalette} \usepackage[macchiato, styleAll]{catppuccinpalette}
\usepackage[tmargin=2cm,rmargin=1in,lmargin=1in,margin=0.85in,bmargin=2cm,footskip=.2in]{geometry} \usepackage[tmargin=2cm,rmargin=1in,lmargin=1in,margin=0.85in,bmargin=2cm,footskip=.2in]{geometry}
@@ -34,7 +35,6 @@
\usepackage{xifthen} \usepackage{xifthen}
\usepackage{pdfpages} \usepackage{pdfpages}
\usepackage{transparent} \usepackage{transparent}
\usepackage{tikzsymbols} \usepackage{tikzsymbols}
\renewcommand\qedsymbol{$\Laughey$} \renewcommand\qedsymbol{$\Laughey$}
@@ -69,15 +69,15 @@
colupper = CtpText, colupper = CtpText,
frame hidden, frame hidden,
boxrule = 0sp, boxrule = 0sp,
borderline west = {2pt}{0pt}{CtpRed}, borderline west = {2pt}{0pt}{CtpLavender},
sharp corners, sharp corners,
detach title, detach title,
before upper = \tcbtitle\par\smallskip, before upper = \tcbtitle\par\smallskip,
coltitle = CtpRed, coltitle = CtpLavender,
fonttitle = \bfseries\sffamily, fonttitle = \bfseries\sffamily,
description font = \mdseries, description font = \mdseries,
separator sign none, separator sign none,
segmentation style={solid, CtpRed}, segmentation style={solid, CtpLavender},
} }
{th} {th}
@@ -86,19 +86,19 @@
{% {%
enhanced, enhanced,
breakable, breakable,
colback = CtpRosewater, colback = CtpSurface0,
colupper = CtpSubtext0, colupper = CtpText,
frame hidden, frame hidden,
boxrule = 0sp, boxrule = 0sp,
borderline west = {2pt}{0pt}{CtpRed}, borderline west = {2pt}{0pt}{CtpLavender},
sharp corners, sharp corners,
detach title, detach title,
before upper = \tcbtitle\par\smallskip, before upper = \tcbtitle\par\smallskip,
coltitle = CtpRed, coltitle = CtpLavender,
fonttitle = \bfseries\sffamily, fonttitle = \bfseries\sffamily,
description font = \mdseries, description font = \mdseries,
separator sign none, separator sign none,
segmentation style={solid, CtpRed}, segmentation style={solid, CtpLavender},
} }
{th} {th}
@@ -108,11 +108,11 @@
{% {%
enhanced enhanced
,breakable ,breakable
,colback = CtpRosewater ,colback = CtpSurface0
,colupper = CtpSubtext0 ,colupper = CtpText
,frame hidden ,frame hidden
,boxrule = 0sp ,boxrule = 0sp
,borderline west = {2pt}{0pt}{CtpRed} ,borderline west = {2pt}{0pt}{CtpLavender}
,sharp corners ,sharp corners
,description font = \mdseries ,description font = \mdseries
,separator sign none ,separator sign none
@@ -120,7 +120,7 @@
%================================ %================================
% Corollery % Corollary
%================================ %================================
\tcbuselibrary{theorems,skins,hooks} \tcbuselibrary{theorems,skins,hooks}
@@ -128,7 +128,8 @@
{% {%
enhanced enhanced
,breakable ,breakable
,colback = CtpMauve!10 ,colback = CtpSurface0
,colupper = CtpText
,frame hidden ,frame hidden
,boxrule = 0sp ,boxrule = 0sp
,borderline west = {2pt}{0pt}{CtpMauve!85!black} ,borderline west = {2pt}{0pt}{CtpMauve!85!black}
@@ -147,7 +148,8 @@
{% {%
enhanced enhanced
,breakable ,breakable
,colback = CtpMauve!10 ,colback = CtpSurface0
,colupper = CtpText
,frame hidden ,frame hidden
,boxrule = 0sp ,boxrule = 0sp
,borderline west = {2pt}{0pt}{CtpMauve!85!black} ,borderline west = {2pt}{0pt}{CtpMauve!85!black}
@@ -172,7 +174,8 @@
{% {%
enhanced, enhanced,
breakable, breakable,
colback = CtpRosewater, colback = CtpSurface0,
colupper = CtpText,
frame hidden, frame hidden,
boxrule = 0sp, boxrule = 0sp,
borderline west = {2pt}{0pt}{CtpPeach}, borderline west = {2pt}{0pt}{CtpPeach},
@@ -192,7 +195,8 @@
{% {%
enhanced, enhanced,
breakable, breakable,
colback = CtpRosewater, colback = CtpSurface0,
colupper = CtpText,
frame hidden, frame hidden,
boxrule = 0sp, boxrule = 0sp,
borderline west = {2pt}{0pt}{CtpPeach}, borderline west = {2pt}{0pt}{CtpPeach},
@@ -262,7 +266,8 @@
{% {%
enhanced enhanced
,breakable ,breakable
,colback = CtpGreen!10 ,colback = CtpSurface0
,colupper = CtpText
,frame hidden ,frame hidden
,boxrule = 0sp ,boxrule = 0sp
,borderline west = {2pt}{0pt}{CtpGreen} ,borderline west = {2pt}{0pt}{CtpGreen}
@@ -288,18 +293,19 @@
{% {%
enhanced, enhanced,
breakable, breakable,
colback = CtpFlamingo, colback = CtpSurface0,
colupper = CtpText,
frame hidden, frame hidden,
boxrule = 0sp, boxrule = 0sp,
borderline west = {2pt}{0pt}{CtpPink}, borderline west = {2pt}{0pt}{CtpBlue},
sharp corners, sharp corners,
detach title, detach title,
before upper = \tcbtitle\par\smallskip, before upper = \tcbtitle\par\smallskip,
coltitle = CtpPink, coltitle = CtpBlue,
fonttitle = \bfseries\sffamily, fonttitle = \bfseries\sffamily,
description font = \mdseries, description font = \mdseries,
separator sign none, separator sign none,
segmentation style={solid, CtpPink}, segmentation style={solid, CtpBlue},
} }
{th} {th}
@@ -308,18 +314,19 @@
{% {%
enhanced, enhanced,
breakable, breakable,
colback = CtpFlamingo, colback = CtpSurface0,
colupper = CtpText,
frame hidden, frame hidden,
boxrule = 0sp, boxrule = 0sp,
borderline west = {2pt}{0pt}{CtpPink}, borderline west = {2pt}{0pt}{CtpBlue},
sharp corners, sharp corners,
detach title, detach title,
before upper = \tcbtitle\par\smallskip, before upper = \tcbtitle\par\smallskip,
coltitle = CtpPink, coltitle = CtpBlue,
fonttitle = \bfseries\sffamily, fonttitle = \bfseries\sffamily,
description font = \mdseries, description font = \mdseries,
separator sign none, separator sign none,
segmentation style={solid, CtpPink}, segmentation style={solid, CtpBlue},
} }
{th} {th}
@@ -329,10 +336,11 @@
\newtcbtheorem[number within=section]{Example}{\exampletitle} \newtcbtheorem[number within=section]{Example}{\exampletitle}
{% {%
colback = CtpSky!10 colback = CtpSurface0
,colupper = CtpText
,breakable ,breakable
,colframe = CtpSky ,colframe = CtpTeal
,coltitle = CtpSky ,coltitle = CtpTeal
,boxrule = 1pt ,boxrule = 1pt
,sharp corners ,sharp corners
,detach title ,detach title
@@ -346,10 +354,11 @@
\newtcbtheorem[number within=chapter]{example}{\exampletitle} \newtcbtheorem[number within=chapter]{example}{\exampletitle}
{% {%
colback = CtpSky!10 colback = CtpSurface0
,colupper = CtpText
,breakable ,breakable
,colframe = CtpSky ,colframe = CtpTeal
,coltitle = CtpSky ,coltitle = CtpTeal
,boxrule = 1pt ,boxrule = 1pt
,sharp corners ,sharp corners
,detach title ,detach title
@@ -414,8 +423,9 @@
\makeatletter \makeatletter
\newtcbtheorem{question}{\questiontitle}{enhanced, \newtcbtheorem{question}{\questiontitle}{enhanced,
breakable, breakable,
colback=white, colback=CtpSurface0,
colframe=CtpSaphire!80!black, colupper=CtpText,
colframe=CtpSaphire!80!CtpText,
attach boxed title to top left={yshift*=-\tcboxedtitleheight}, attach boxed title to top left={yshift*=-\tcboxedtitleheight},
fonttitle=\bfseries, fonttitle=\bfseries,
title={#2}, title={#2},
@@ -427,7 +437,7 @@
boxrule=0pt, boxrule=0pt,
}, },
underlay boxed title={% underlay boxed title={%
\path[fill=tcbcolframe] (title.south west)--(title.south east) \path[fill=CtpSaphire] (title.south west)--(title.south east)
to[out=0, in=180] ([xshift=5mm]title.east)-- to[out=0, in=180] ([xshift=5mm]title.east)--
(title.center-|frame.east) (title.center-|frame.east)
[rounded corners=\kvtcb@arc] |- [rounded corners=\kvtcb@arc] |-
@@ -444,8 +454,9 @@
\makeatletter \makeatletter
\newtcolorbox{solution}{enhanced, \newtcolorbox{solution}{enhanced,
breakable, breakable,
colback=white, colback=CtpSurface0,
colframe=CtpGreen!80!black, colupper=CtpText,
colframe=CtpGreen!80!CtpText,
attach boxed title to top left={yshift*=-\tcboxedtitleheight}, attach boxed title to top left={yshift*=-\tcboxedtitleheight},
title=Solution, title=Solution,
boxed title size=title, boxed title size=title,
@@ -456,7 +467,7 @@
boxrule=0pt, boxrule=0pt,
}, },
underlay boxed title={% underlay boxed title={%
\path[fill=tcbcolframe] (title.south west)--(title.south east) \path[fill=CtpGreen] (title.south west)--(title.south east)
to[out=0, in=180] ([xshift=5mm]title.east)-- to[out=0, in=180] ([xshift=5mm]title.east)--
(title.center-|frame.east) (title.center-|frame.east)
[rounded corners=\kvtcb@arc] |- [rounded corners=\kvtcb@arc] |-
@@ -472,7 +483,8 @@
\makeatletter \makeatletter
\newtcbtheorem{qstion}{\questiontitle}{enhanced, \newtcbtheorem{qstion}{\questiontitle}{enhanced,
breakable, breakable,
colback=white, colback=CtpSurface0,
colupper=CtpText,
colframe=CtpTeal, colframe=CtpTeal,
attach boxed title to top left={yshift*=-\tcboxedtitleheight}, attach boxed title to top left={yshift*=-\tcboxedtitleheight},
fonttitle=\bfseries, fonttitle=\bfseries,
@@ -485,7 +497,7 @@
boxrule=0pt, boxrule=0pt,
}, },
underlay boxed title={% underlay boxed title={%
\path[fill=tcbcolframe] (title.south west)--(title.south east) \path[fill=CtpTeal] (title.south west)--(title.south east)
to[out=0, in=180] ([xshift=5mm]title.east)-- to[out=0, in=180] ([xshift=5mm]title.east)--
(title.center-|frame.east) (title.center-|frame.east)
[rounded corners=\kvtcb@arc] |- [rounded corners=\kvtcb@arc] |-
@@ -498,19 +510,20 @@
\newtcbtheorem[number within=chapter]{wconc}{\wrongctitle}{ \newtcbtheorem[number within=chapter]{wconc}{\wrongctitle}{
breakable, breakable,
enhanced, enhanced,
colback=white, colback=CtpSurface0,
colframe=CtpRed, colupper=CtpText,
colframe=CtpTeal,
arc=0pt, arc=0pt,
outer arc=0pt, outer arc=0pt,
fonttitle=\bfseries\sffamily\large, fonttitle=\bfseries\sffamily\large,
colbacktitle=CtpRed, colbacktitle=CtpTeal,
attach boxed title to top left={}, attach boxed title to top left={},
boxed title style={ boxed title style={
enhanced, enhanced,
skin=enhancedfirst jigsaw, skin=enhancedfirst jigsaw,
arc=3pt, arc=3pt,
bottom=0pt, bottom=0pt,
interior style={fill=CtpRed} interior style={fill=CtpTeal}
}, },
#1 #1
}{def} }{def}
@@ -525,19 +538,21 @@
\tcbuselibrary{skins} \tcbuselibrary{skins}
\newtcolorbox{note}[1][]{% \newtcolorbox{note}[1][]{%
enhanced jigsaw, enhanced jigsaw,
colback=gray!20!white,% colback=CtpSurface0,%
colframe=gray!80!black, colframe=CtpSurface1,
colupper=CtpText,
size=small, size=small,
boxrule=1pt, boxrule=1pt,
title=\textbf{Bemerkung:-}, title=\textbf{\notetitle},
halign title=flush center, halign title=flush center,
coltitle=black, coltitle=CtpText,
breakable, breakable,
drop shadow=black!50!white, drop shadow=CtpCrust,
attach boxed title to top left={xshift=1cm,yshift=-\tcboxedtitleheight/2,yshifttext=-\tcboxedtitleheight/2}, attach boxed title to top left={xshift=1cm,yshift=-\tcboxedtitleheight/2,yshifttext=-\tcboxedtitleheight/2},
minipage boxed title=2.5cm, minipage boxed title=2.5cm,
boxed title style={% boxed title style={%
colback=white, colback=CtpSurface0,
colupper=CtpText,
size=fbox, size=fbox,
boxrule=1pt, boxrule=1pt,
boxsep=2pt, boxsep=2pt,
@@ -546,9 +561,9 @@
\coordinate (dotB) at ($(interior.east) + (0.5pt,0)$); \coordinate (dotB) at ($(interior.east) + (0.5pt,0)$);
\begin{scope} \begin{scope}
\clip (interior.north west) rectangle ([xshift=3ex]interior.east); \clip (interior.north west) rectangle ([xshift=3ex]interior.east);
\filldraw [white, blur shadow={shadow opacity=60, shadow yshift=-.75ex}, rounded corners=2pt] (interior.north west) rectangle (interior.south east); \filldraw [CtpBase, blur shadow={shadow opacity=60, shadow yshift=-.75ex}, rounded corners=2pt] (interior.north west) rectangle (interior.south east);
\end{scope} \end{scope}
\begin{scope}[gray!80!black] \begin{scope}[CtpSurface1]
\fill (dotA) circle (2pt); \fill (dotA) circle (2pt);
\fill (dotB) circle (2pt); \fill (dotB) circle (2pt);
\end{scope} \end{scope}
@@ -711,3 +726,4 @@
\end{tikzpicture}}% \end{tikzpicture}}%
\@starttoc{toc}} \@starttoc{toc}}
\makeatother \makeatother

View File

@@ -1,53 +1,113 @@
\chapter{Data Cubes} \chapter{Data Cubes}
The evolution of data management has transitioned through distinct eras, beginning with the Age of Transactions in the late 20th century, moving into the Age of Business Intelligence in the mid-1990s, and culminating in the modern Age of Big Data. This progression reflects a shift in focus from simple record-keeping to complex data-based decision support. Central to this transition is the distinction between two primary operational paradigms: Online Transaction Processing (OLTP) and Online Analytical Processing (OLAP). While OLTP systems are designed for consistent and reliable record-keeping through numerous small, fast writes, OLAP systems are built for historical, summarized, and consolidated data analysis characterized by long, heavy, and complex aggregating queries. The history of data management has progressed through several distinct eras, each defined by the primary utility of information. The initial phase, spanning from the 1970s to the 2000s, is characterized as the **Age of Transactions**. During this period, the development of the relational model, SQL, and the concept of data independence allowed organizations to maintain consistent and reliable records. These systems were designed to handle a continuous stream of updates, inserts, and deletions, necessitating a focus on concurrency and integrity.
To bridge the gap between raw data storage and analytical utility, database systems utilize specific architectural components. Views provide a logical layer over base tables, allowing users to interact with data without needing to manage its physical storage. Meanwhile, indexes serve as the engine for performance, enabling the rapid retrieval of specific tuples from massive datasets. In the context of OLAP, these concepts are expanded into the multidimensional data model, often visualized as a data cube. However, in the mid-1990s, a transition occurred toward the **Age of Business Intelligence**. As computational power increased and data volumes grew, corporate leadership—such as CEOs and CFOs—began requiring high-level insights rather than individual record access. This shift led to the emergence of specialized systems designed for data analysis, reporting, and dashboarding. This evolution eventually culminated in the modern **Age of Big Data**, characterized by massive scale and distributed processing.
\dfn{Online Transaction Processing (OLTP)}{A database paradigm focused on managing transaction-oriented applications. It is characterized by a high volume of short, fast, and concurrent transactions, primarily involving writes to small portions of normalized data to ensure consistent and reliable record-keeping.} \dfn{OLTP (Online Transactional Processing)}{A paradigm of data management focused on the day-to-day operational tasks of a business. It emphasizes record-keeping, high-frequency write operations, and the maintenance of data integrity through ACID properties.}
\dfn{Online Analytical Processing (OLAP)}{A paradigm designed to support multi-dimensional data analysis for decision-making. It typically involves complex, long-running queries on large portions of consolidated, historical data, often stored in denormalized structures like data cubes.} \dfn{OLAP (Online Analytical Processing)}{A data management paradigm designed for decision support and business intelligence. It involves the analysis of large, consolidated datasets that are typically frozen or "non-volatile," focusing on complex read-only queries rather than real-time updates.}
\section{Virtual Views and Data Interfaces} \section{Comparing OLTP and OLAP Paradigms}
Virtual views are a cornerstone of modern database management, acting as relations that are defined by a query rather than being stored physically on a disk. These views exist only logically; when a user queries a view, the system's query processor substitutes the view name with its underlying definition. This mechanism offers several advantages, including simplified query writing for end-users and enhanced security by restricting access to sensitive columns of a base table. To understand the necessity of specialized analytical structures like data cubes, one must distinguish between the operational requirements of OLTP and the analytical requirements of OLAP. In an OLTP environment, the system is "zoomed in" on specific, detailed records, such as an individual customer's order or a specific product's inventory level. The goal is consistent record-keeping. Because these systems are interactive and face end-users directly, performance is measured in milliseconds, and the design relies heavily on normalization to prevent update, insertion, and deletion anomalies.
\thm{View Preprocessing}{The process by which a preprocessor replaces an operand in a query that is a virtual view with a piece of a parse tree or expression tree representing the view's construction from base tables. This allows the query to be interpreted entirely in terms of physical storage.} In contrast, OLAP systems are "zoomed out," providing a high-level view of the entire organization. Instead of individual transactions, OLAP focuses on aggregated data—such as total sales by region per quarter. These systems are used for decision support, where the speed of a query might range from seconds to several hours. Redundancy is often embraced in OLAP to improve query efficiency, leading to the use of denormalized structures.
The management of views also extends to their modification. While most views are read-only, certain "updatable views" allow for insertions, deletions, or updates that are passed through to the underlying base tables. For a view to be considered updatable by standard SQL rules, it must generally be defined over a single relation without the use of aggregation or distinct clauses. Furthermore, database designers can use specialized triggers to define how modifications to a view should be handled if the standard pass-through logic is insufficient. \thm{The Trade-off of Freshness vs. Performance}{Running complex analytical queries directly on a live OLTP system is generally avoided because it consumes significant resources and slows down the day-to-day business operations. Consequently, data is extracted from OLTP systems and loaded into dedicated OLAP environments, typically during off-peak hours.}
\dfn{Instead-Of Trigger}{A specialized trigger defined on a virtual view that intercepts an attempted modification (INSERT, UPDATE, or DELETE). Instead of executing the modification on the view, the system executes a predefined sequence of actions on the underlying base tables to achieve the intended result.} \nt{Backups are critical in OLTP because losing transaction records means losing the business history. In OLAP, data can often be re-imported from the original sources, making backup procedures slightly less existential but still important for efficiency.}
\section{Performance Optimization through Indexing} \section{The Data Cube Model}
As databases grow into the terabyte and petabyte range, the cost of scanning every block of a relation to find specific information becomes prohibitive. Indexes are auxiliary data structures designed to mitigate this cost by allowing the system to locate tuples with specific search-key values without a full table scan. The primary motivation for indexing is the reduction of disk I/O, which is the dominant cost in query execution. The logical foundation of analytical processing is the **Data Cube**. While the term suggests a three-dimensional structure, a data cube is an n-dimensional hypercube that can accommodate any number of dimensions. Each dimension represents a different axis of analysis, such as time, geography, or product category.
\thm{Clustering Index Advantage}{An index is considered clustering if all tuples with a specific search-key value are stored on as few disk blocks as possible. A clustering index typically provides a massive speedup for range queries and selections because once the first matching block is found, the system can read subsequent matching tuples with minimal additional seek time or rotational latency.} \dfn{Dimension (Axis)}{A specific category or perspective used to organize data within a cube. Common dimensions include "Where" (Geography), "When" (Time), "Who" (Salesperson), and "What" (Product).}
Selecting the appropriate set of indexes is one of the most critical decisions for a database administrator. While indexes significantly accelerate read-heavy OLAP queries, they impose a maintenance penalty on OLTP operations. Every time a tuple is inserted, deleted, or updated, the associated indexes must also be modified, requiring additional disk writes. Therefore, the optimal indexing strategy depends on the specific workload—balancing the frequency of specific query forms against the frequency of modifications. \dfn{Member}{An individual value within a dimension. For example, "2024" is a member of the "Year" axis, and "Switzerland" is a member of the "Location" axis.}
\dfn{Secondary Index}{An index that does not determine the physical placement of records in the data file. Secondary indexes are necessarily dense, meaning they contain pointers to every record in the data file to facilitate retrieval by non-primary attributes.} At the intersection of specific members from every dimension lies a **cell**, which contains a numerical **value** or **fact**. For instance, a cell might store the information that in the year 2024, in Switzerland, seven servers were sold. This highly structured model ensures that for every combination of dimensional coordinates, a specific metric is available.
\section{The Multidimensional Data Model} \section{The Fact Table and Normal Forms}
In the analytical realm, data is often viewed through a multidimensional lens rather than as flat tables. This model organizes information around "facts"—events of interest like a specific sale—and "dimensions," which are the axes of the data space, such as time, location, or product type. This structure allows analysts to "slice and dice" data to find patterns. In a relational implementation, a data cube is represented physically as a **Fact Table**. This table serves as the central hub of the analytical schema. Every row in a fact table represents a single cell from the hypercube.
\dfn{Data Cube}{A multidimensional representation of data where each point represents a fact and the axes represent various dimensions of the data. A formal data cube includes not only the raw data but also precomputed aggregations across all subsets of dimensions.} \thm{Fact Tables and the Sixth Normal Form}{A fact table represents the highest level of data structure, often described as being in the Sixth Normal Form (6NF). In this state, every column representing a dimension is part of a composite primary key, and there is typically only one non-key column representing the recorded value.}
To support this model, ROLAP systems often use a "star schema." In this architecture, a central fact table contains the quantitative measures and foreign keys referencing "dimension tables." Dimension tables store descriptive information about the axes of the cube. If these dimension tables are further normalized, the structure is referred to as a "snowflake schema." In practice, fact tables may have multiple "measure" columns, such as revenue, profit, and quantity. This is often preferred over a strict 6NF to reduce the number of rows. The process of moving between a single-measure fact table and a multi-measure table is known as **pivoting** and **unpivoting**.
\thm{Slicing and Dicing}{Slicing is the act of picking a specific value for one or more dimensions to focus on a particular subset of the cube. Dicing involves selecting ranges for several dimensions to define a smaller, focused sub-cube for analysis.} \section{Operations on Data Cubes: Slicing and Dicing}
\section{Data Cube Operations and Implementation} Analyzing a cube involves reducing its complexity to a format that can be visualized, typically on a two-dimensional screen or a sheet of paper. This is achieved through slicing and dicing.
Navigating a data cube requires operations that change the level of granularity. "Roll-up" is the process of moving from a fine-grained view to a more summarized view by aggregating along a dimension (e.g., viewing sales by year instead of by month). Conversely, "drill-down" is the process of moving from a summarized view to a more detailed one. \dfn{Slicing}{The process of selecting a single member from a specific dimension, thereby reducing the dimensionality of the cube. It is analogous to taking a slice of a physical cake; if you slice a 3D cube on a specific year, you are left with a 2D square representing all other dimensions for that year.}
The implementation of these operations varies between Relational OLAP (ROLAP) and Multidimensional OLAP (MOLAP). ROLAP utilizes standard relational tables and extended SQL operators, whereas MOLAP uses specialized, non-relational structures that store the cube and its aggregates directly. One of the most powerful tools in this environment is the CUBE operator. \dfn{Dicing}{The arrangement of remaining dimensions onto the rows and columns of a cross-tabulated view (or pivot table). Dicing allows the user to explicitly define the grid they wish to see, such as putting "Salesperson" on the rows and "Year" on the columns.}
\dfn{CUBE Operator}{An extension of the GROUP BY clause that computes all possible aggregations across a set of dimensions. It effectively augments a fact table with "border" values representing "any" or summarized totals for every combination of the specified attributes.} \nt{Dimensions that are not used as dicers (rows or columns) must be set as slicers. Slicers act as filters for the entire view, ensuring that the displayed data is logically consistent with the user's focus.}
When implemented in SQL, the CUBE operator produces a result set where the "all" or summarized values are represented by NULLs in the grouping columns. This allows a single query to return the detailed data, subtotals for every dimension, and a grand total for the entire dataset. To manage the potentially explosive growth of data in a cube, many systems use "materialized views," which are views whose results are physically stored on disk and incrementally updated as the base data changes. \section{Hierarchies and Aggregation}
\thm{The Thomas Write Rule}{A principle in concurrency control that allows certain writes to be skipped if a write with a later timestamp is already in place, assuming that no other transaction needs to see the skipped value. This is relevant in the context of maintaining analytical data consistent with temporal versions.} Dimensions in a data cube are rarely flat lists; they are usually organized into **hierarchies**. For example, the "Location" dimension might move from City to Country to Continent to the whole World. The "Time" dimension might move from Day to Month to Quarter to Year.
In conclusion, the data cube represents a sophisticated integration of logical views, performance indexing, and multidimensional modeling. By leveraging these structures, database systems can provide the interactive, high-speed analysis required for modern decision support, even when operating on the vast scales of contemporary data warehouses. \dfn{Roll-up}{The action of moving up a hierarchy to a higher level of granularity. Rolling up from "City" to "Country" involves aggregating (summing, averaging, etc.) all city values into a single total for the country.}
\dfn{Drill-down}{The inverse of a roll-up, where a user moves down a hierarchy to view more specific details. Drilling down from "Year" might reveal the underlying data for each individual "Month."}
In a cross-tabulated view, these hierarchies are visualized through \textbf{subtotals}. Column hierarchies are often shown using "L-shaped" headers, while row hierarchies typically use indentation, bolding, and underlining to distinguish between levels.
\section{The ETL Process}
Data does not exist in a cube format by default. It must be moved from heterogeneous operational sources (ERP, CRM, files) into the OLAP system through a process known as \textbf{ETL}.
\thm{The ETL Verb}{ETL stands for Extract, Transform, and Load. It is often used as a verb in industry (e.g., "to ETL the data"), describing the complex engineering task of consolidating data into a unified analytical structure.}
\begin{itemize}
\item \textbf{Extract:} Connecting to source systems, often via gateways and firewalls, to pull raw data. This can be done through triggers, log extraction, or incremental updates.
\item \textbf{Transform:} The most labor-intensive phase, involving data cleaning (e.g., translating "Mr." and "Mister" into a single format), merging tables, filtering irrelevant records, and ensuring integrity constraints are met.
\item \textbf{Load:} Inserting the transformed data into the data cube, building indices to accelerate future queries, and potentially partitioning the data across multiple machines.
\end{itemize}
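As a rough illustration of the Transform and Load phases, a minimal SQL sketch might look as follows; the staging and warehouse table names are hypothetical, not taken from the lecture:
\begin{verbatim}
-- Transform: harmonize different spellings of the same salutation
UPDATE staging_customers
SET    salutation = 'Mr.'
WHERE  salutation IN ('Mister', 'MR', 'Mr');

-- Load: aggregate the cleaned rows and append them to the fact table
INSERT INTO sales_fact (year, country, product, revenue)
SELECT EXTRACT(YEAR FROM order_date), country, product, SUM(amount)
FROM   staging_orders
GROUP  BY EXTRACT(YEAR FROM order_date), country, product;
\end{verbatim}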
\section{Implementation Architectures: ROLAP and MOLAP}
There are two primary flavors of OLAP implementation. **MOLAP (Multidimensional OLAP)** uses specialized, non-relational data structures to store the cube. **ROLAP (Relational OLAP)** implements the cube logic on top of standard relational tables.
In ROLAP, the schema often takes one of two shapes:
\begin{enumerate}
\item \textbf{Star Schema:} A central fact table surrounded by "satellite" dimension tables. Each row in the fact table contains foreign keys pointing to the members in the dimension tables.
\item \textbf{Snowflake Schema:} A more normalized version of the star schema where dimension tables are themselves decomposed into further satellite tables (e.g., a City table pointing to a Country table).
\end{enumerate}
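A minimal star-schema sketch in SQL, using hypothetical dimension and fact table names: the central fact table holds one row per cube cell and references its satellites through foreign keys.
\begin{verbatim}
-- Satellite (dimension) tables
CREATE TABLE dim_time     (time_id     INTEGER PRIMARY KEY,
                           year        INTEGER,
                           quarter     INTEGER);
CREATE TABLE dim_location (location_id INTEGER PRIMARY KEY,
                           city        VARCHAR(100),
                           country     VARCHAR(100));

-- Central fact table: one row per cell of the cube
CREATE TABLE sales_fact (
    time_id     INTEGER REFERENCES dim_time,
    location_id INTEGER REFERENCES dim_location,
    revenue     DECIMAL(12,2),
    PRIMARY KEY (time_id, location_id)
);
\end{verbatim}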
\thm{The Denormalized Fact Table}{For extreme performance, some designers join all satellite information back into a single, massive fact table. This creates significant redundancy but allows for extremely fast aggregations as no joins are required during query time.}
\section{SQL Extensions for Analytical Processing}
While standard SQL can be used to query fact tables, the code required to generate comprehensive reports with subtotals is often repetitive and prone to error. To address this, SQL was extended with specialized grouping functions.
\dfn{GROUPING SETS}{An extension of the GROUP BY clause that allows a user to specify multiple groupings in a single query. It is logically equivalent to a UNION of several GROUP BY queries, but more efficient.}
\thm{The CUBE Operator}{Syntactic sugar that generates the power set of all possible groupings for the specified attributes. For $n$ attributes, GROUP BY CUBE produces $2^n$ grouping sets, providing subtotals for every possible combination.}
\thm{The ROLLUP Operator}{A specialized version of grouping sets that follows a hierarchical path. For $n$ attributes, it produces $n+1$ grouping sets by progressively removing attributes from right to left. This is the ideal tool for generating totals and subtotals in a dimension hierarchy.}
\nt{The order of attributes matters significantly for ROLLUP but is irrelevant for CUBE. Because ROLLUP drops attributes from right to left, you must list them from the most general to the most specific (e.g., Continent, Country, City) to obtain the subtotals of the hierarchy.}
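The following sketch contrasts the two operators on a hypothetical denormalized fact table (the column names are assumed for illustration only):
\begin{verbatim}
-- CUBE: all 2^3 = 8 grouping sets, i.e. every combination of subtotals
SELECT country, year, product, SUM(revenue)
FROM   sales_fact
GROUP  BY CUBE (country, year, product);

-- ROLLUP: only the hierarchical groupings
-- (continent, country, city), (continent, country), (continent), ()
SELECT continent, country, city, SUM(revenue)
FROM   sales_fact
GROUP  BY ROLLUP (continent, country, city);
\end{verbatim}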
\section{Querying with MDX}
For environments that require a dedicated multidimensional language, \textbf{MDX (Multi-Dimensional Expressions)} is used. Unlike SQL, which treats data as sets of rows, MDX natively understands the concept of dimensions, members, and cells.
MDX allows a user to explicitly define which dimensions should appear on the columns and which on the rows of a result set. It uses a "WHERE" clause not for relational selection, but as a "slicer" to pick a specific coordinate in the cube. While advanced users might write MDX, most interact with it indirectly through the drag-and-drop interfaces of spreadsheet or business intelligence software.
\section{Standardized Reporting and XBRL}
Data cube technology has significant real-world applications in financial and sustainability reporting. Regulatory bodies, such as the SEC in the United States and ESMA in the European Union, now require companies to submit reports in standardized electronic formats like \textbf{XBRL (eXtensible Business Reporting Language)}.
\dfn{Inline XBRL}{A technology that embeds machine-readable data cube information within a standard human-readable HTML webpage. This allows a single document to be viewed by a human in a browser while its individual values can be extracted and reconstructed into a cube by a computer.}
In an XBRL report, every financial value is tagged with its dimensional coordinates: what the value is (e.g., Assets), who the company is (e.g., Coca-Cola), when the period was (e.g., Dec 31, 2024), and the currency used (e.g., USD). This creates a "table universe" of standardized, comparable data across entire industries.
\nt{The shift toward machine-readable reporting is often referred to as "interactive data," as it allows investors and regulators to automatically perform slicing and dicing operations across thousands of company filings simultaneously.}
In essence, data cube theory provides the bridge between the chaotic, high-velocity world of transactional data and the structured, strategic world of corporate decision-making. By transforming "wheat" (raw transaction logs) into "bread" (actionable reports), these systems enable a level of organizational insight that was impossible in the era of paper ledgers or simple flat-file databases.
To visualize this, think of a fact table as a collection of thousands of individual lego bricks. Each brick has a specific color, size, and shape (its dimensions). While they are just a pile of plastic on their own, the dicing and rolling-up operations allow us to assemble them into a specific structure—a castle or a bridge—that reveals the overall pattern and strength of the data.

View File

@@ -1,55 +1,132 @@
\chapter{Data Definition with SQL} \chapter{Data Definition with SQL}
\section{Overview of Data Definition and the SQL Language} \section{Overview of Data Definition}
The process of managing data within an information system begins with a rigorous definition of its structure. The Data Definition Language (DDL) serves as the primary tool for database administrators to specify the logical organization of data, known as the schema. Historically, the evolution of database languages was marked by the development of SEQUEL in the early 1970s at the IBM Almaden Research Center, which was eventually renamed SQL due to trademark concerns. SQL is a declarative, set-based language, meaning it allows users to specify what data they want to retrieve or manipulate without detailing the step-by-step physical procedures. This abstraction significantly enhances the productivity of developers by separating the conceptual model from the physical storage layer.
Data in a relational system is organized into two-dimensional tables called relations. These relations must adhere to fundamental integrity principles to maintain data quality. Relational integrity ensures that every entry in a table follows the structure defined by its attributes. Domain integrity mandates that every attribute value belongs to a specific, predefined set of acceptable values. Finally, atomic integrity requires that every component of a tuple is an indivisible unit, preventing the use of complex structures like lists or nested records as simple attribute values. Data definition is the fundamental process of specifying the logical structure of a database, often referred to as the schema. In the context of SQL (Structured Query Language), this involves declaring the tables that will store information, identifying the types of data permitted in each column, and establishing rules to maintain the correctness and consistency of that data. The relational model, which serves as the foundation for modern database systems, represents information as two-dimensional tables called relations. By defining these relations, developers create a rigid framework that ensures data independence, allowing the underlying physical storage to be optimized without affecting the high-level queries used by applications.
\section{Core Domain Types and Atomic Integrity} \section{Mathematical Foundations of Relations}
A central aspect of data definition is the selection of appropriate domain types for each attribute. SQL provides a rich set of standardized types to handle various data categories. Character data is typically managed through fixed-length strings (CHAR) or variable-length strings with a maximum limit (VARCHAR). For exceptionally large textual content, types such as CLOB or TEXT are utilized. Numeric data is bifurcated into exact and approximate types. Exact numbers include various sizes of integers (SMALLINT, INTEGER, BIGINT) and fixed-point decimals (DECIMAL or NUMERIC), where precision and scale can be strictly defined. Approximate numbers, such as REAL and DOUBLE PRECISION, follow floating-point standards to represent scientific data where a degree of approximation is acceptable.
\dfn{Atomic Integrity}{The requirement that every value in a relational table must be a single, indivisible data item of an elementary type, such as an integer or a string.} The concept of a relational table originates from mathematical set theory. At its core, a relation is defined over a series of sets, which are known as attribute domains. While a general relation can represent any subset of a Cartesian product, SQL tables require more specific semantic structures to function effectively as data stores.
Temporal data is equally vital, with SQL supporting types for dates, times, and timestamps. These allow for the storage of specific points in time, often including time zone information for global applications. Furthermore, intervals represent durations, such as "two years and four months." Binary data, such as images or passport scans, is stored using BLOB or BYTEA types. Boolean types provide the foundation for logical operations, supporting TRUE, FALSE, and the three-valued logic involving UNKNOWN when NULL values are present. A collection of data can be viewed as a set of records, where each record acts as a "map" or a function from a set of attributes to values. To transform a simple collection into a formal relational table, three specific types of integrity must be enforced.
\section{Structural Operations: Creating and Modifying Tables} \dfn{Relational Table (Set Semantics)}{A relational table is a set of maps from attribute names to values that satisfies relational integrity, domain integrity, and atomic integrity.}
The lifecycle of a database schema involves the creation, modification, and removal of tables. The \texttt{CREATE TABLE} command is the primary DDL statement used to introduce new relations. It requires the specification of a table name, a list of attributes, and their associated domains. A newly created table is initially empty, representing an extension of zero tuples.
\thm{Relational Schema}{The formal definition of a relation, comprising its unique name and the set of attributes along with their corresponding data types or domains.} \thm{The Three Rules of Relational Integrity}{To qualify as a relational table, a collection must fulfill:
As requirements change, the \texttt{ALTER TABLE} statement allows administrators to evolve the schema without deleting existing data. This includes adding new columns, which may be initialized with NULL or a specific default value, and renaming or removing existing columns. When a table is no longer required, the \texttt{DROP TABLE} command removes both the schema and all stored data from the system. To avoid errors during automated scripts, the \texttt{IF EXISTS} clause is frequently employed to ensure a command only executes if the target relation is present. \begin{enumerate}
\item \textbf{Relational Integrity}: Every record in the collection must have the same support, meaning they all share the exact same set of attributes.
\item \textbf{Domain Integrity}: Each attribute is associated with a specific domain (type), and every value for that attribute must belong to that domain.
\item \textbf{Atomic Integrity}: Every value in the table must be atomic, meaning it cannot be broken down into smaller components (e.g., no tables within tables).
\end{enumerate}}
\section{Data Manipulation and Logic of Modifications} \nt{While mathematics primarily uses set semantics (no duplicates), practical SQL implementations often utilize bag semantics, allowing for duplicate records, or list semantics, where the order of records is preserved.}
Once the schema is defined, Data Manipulation Language (DML) commands are used to populate and maintain the data. The \texttt{INSERT} statement adds new records to a relation. It can take specific values for a single tuple or use a subquery to perform bulk insertions from other tables. A critical rule in SQL modification is that the system must evaluate the query portion of an insertion entirely before any data is actually added to the target table. This prevents infinite loops or inconsistent states where a new tuple might satisfy its own insertion criteria.
\dfn{Cascading Rollback}{A situation where the abortion of one transaction necessitates the cancellation of other dependent transactions that have read data written by the initial transaction.} \section{The Evolution and Nature of SQL}
The \texttt{DELETE} and \texttt{UPDATE} commands provide the means to remove or modify existing tuples based on specific conditions. Similar to insertions, these operations apply the condition to every tuple in the relation and execute the change only on those that satisfy the predicate. Through these commands, the system transitions between different database states while aiming to preserve overall consistency. SQL was developed in the early 1970s at IBM's San Jose research facility, originally under the name SEQUEL (Structured English Query Language). Created by Don Chamberlin and Raymond Boyce, the language was designed to be more intuitive than earlier procedural languages by using English-like syntax.
\section{Integrity Constraints and Key Definitions} The primary characteristic of SQL is that it is a declarative language. Unlike imperative languages such as Java or C++, where the programmer must define exactly how to retrieve or calculate data, a SQL user simply declares what the desired result looks like. The database engine then determines the most efficient way to execute the request.
Constraints are declarative rules that restrict the data permitted in the database to prevent inaccuracies. The most fundamental constraints are those defining keys. A primary key uniquely identifies each row in a table and, by definition, cannot contain NULL values. In contrast, the \texttt{UNIQUE} constraint ensures distinctness but may permit NULLs depending on the specific DBMS implementation.
\thm{The Thomas Write Rule}{A principle in timestamp-based concurrency control that allows a write operation to be ignored if a later transaction has already updated the same data element, thereby maintaining the intended final state.} \thm{Set-Based Processing}{SQL is a set-based language, meaning it manipulates entire relations with a single command rather than processing one record at a time.}
Beyond keys, \texttt{NOT NULL} constraints ensure that critical attributes always have a value. \texttt{CHECK} constraints provide more complex logic, allowing the system to validate that an attribute or an entire tuple meets specific boolean conditions. For instance, a check could ensure that a person's age is never negative or that a start date precedes an end date. \section{SQL Data Types and Domains}
\section{Referential Integrity and Foreign Keys} Every attribute in a SQL table must be assigned a data type. These types define the nature of the data and the operations that can be performed on it.
Referential integrity is maintained through foreign keys, which establish a link between tables. A foreign key in one table must reference a unique or primary key in another. This ensures that the relationship between entities remains valid; for example, every movie in a tracking system must be associated with an existing studio.
\dfn{Foreign Key}{An attribute or set of attributes in a relation that serves as a reference to a primary or unique key in a different relation, enforcing a logical connection between the two.} \subsection{String Types}
The management of these links during data removal or updates is governed by specific policies. The \texttt{CASCADE} policy ensures that changes in the parent table are automatically reflected in the child table. Alternatively, the \texttt{SET NULL} policy breaks the link by nullifying the foreign key when the referenced record is deleted. If neither is appropriate, the \texttt{RESTRICT} policy blocks any modification that would break referential integrity. SQL provides several ways to store text. Fixed-length strings are defined as \texttt{char(n)}, where the system reserves exactly $n$ characters. If the input is shorter, it is padded with spaces. Variable-length strings with a specified limit are defined as \texttt{varchar(n)}. For very long text without a specific limit, PostgreSQL uses the \texttt{text} type, while the SQL standard refers to this as \texttt{clob} (Character Large Object).
\section{Advanced Constraints and Deferred Checking} \subsection{Numeric Types}
For constraints that span multiple tables or require global validation, SQL offers assertions. Unlike table-based checks, assertions are standalone schema elements that the DBMS must verify whenever any involved relation is modified. This makes them powerful but potentially expensive to implement efficiently.
\thm{Two-Phase Locking (2PL)}{A concurrency control protocol that guarantees conflict-serializability by requiring that all lock acquisitions by a transaction must occur before any of its locks are released.} Numbers are categorized into exact and approximate types. Exact numbers include integers (\texttt{smallint}, \texttt{integer}, \texttt{bigint}) and fixed-point decimals.
In complex transactions where multiple interrelated tables are updated, immediate constraint checking can be problematic. SQL addresses this with deferred checking. By declaring a constraint as \texttt{DEFERRABLE}, the system can postpone validation until the very end of a transaction, just before it commits. This allows for temporary inconsistencies that are resolved by the time the transaction completes its entire sequence of actions. \dfn{Fixed-Point Decimal}{A numeric type defined by \texttt{decimal(p, s)}, where $p$ is the total number of significant digits (precision) and $s$ is the number of digits after the decimal point (scale).}
\section{Active Database Elements: Triggers} Approximate numbers are represented as floating-point values using \texttt{real} (single precision) or \texttt{double precision}. These follow the IEEE 754 standard and are highly efficient because they are handled directly by computer hardware.
Triggers, or Event-Condition-Action (ECA) rules, represent the transition from a passive database to an active one. A trigger is awakened by a specific event—such as an insertion, deletion, or update—and then evaluates a condition. If the condition is true, the system executes a predefined set of actions.
\dfn{Trigger}{A stored procedure that is automatically invoked by the DBMS in response to specified changes to the database, consisting of a triggering event, a condition, and a resulting action.} \subsection{Temporal and Binary Types}
Triggers offer significant flexibility compared to standard constraints. They can be set to execute either \texttt{BEFORE} or \texttt{AFTER} the triggering event and can operate at either the row level (executing for every modified tuple) or the statement level (executing once for the entire SQL command). They are frequently used to enforce complex business rules, maintain audit logs, or automatically fix data inconsistencies that simple constraints cannot handle. SQL supports complex date and time tracking. The \texttt{date} type follows the Gregorian calendar, while \texttt{time} tracks hours, minutes, and seconds. \texttt{timestamp} combines both, and can optionally include time zone data to handle global information.
\nt{The \texttt{interval} type represents a duration. However, there is a "duration wall" between months and days because the number of days in a month is variable, making certain additions ambiguous.}
Binary data, such as images or videos, is stored using \texttt{binary(p)}, \texttt{varbinary(p)}, or \texttt{blob} (referred to as \texttt{bytea} in PostgreSQL).
\section{Structural Management of Tables}
The Data Definition Language (DDL) subset of SQL provides commands to manage the lifecycle of tables.
\subsection{Creating and Dropping Tables}
The \texttt{CREATE TABLE} statement is used to define a new relation. It requires a unique table name, a list of attributes, and their associated domains. A newly created table is initially empty.
\nt{In SQL, names are generally case-insensitive. However, if a developer needs to force a specific case for an attribute name, they must surround it with double quotes. Single quotes are reserved exclusively for string literals (values).}
To remove a table entirely from the database, the \texttt{DROP TABLE} command is used. If there is uncertainty about whether a table exists, the \texttt{IF EXISTS} clause can be added to prevent execution errors.
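A minimal sketch of both commands, using a hypothetical \texttt{people} table:
\begin{verbatim}
DROP TABLE IF EXISTS people;

CREATE TABLE people (
    name     VARCHAR(100),
    age      INTEGER,
    birthday DATE
);
\end{verbatim}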
\subsection{Modifying Tables}
The \texttt{ALTER TABLE} command allows for changes to an existing table's schema. This includes adding new columns, removing existing ones, or renaming attributes and the table itself.
\thm{Adding Columns to Populated Tables}{When a new column is added to a table that already contains data, the system must fill the new attribute for existing rows. By default, it uses \texttt{NULL}, but a specific \texttt{DEFAULT} value can be specified instead.}
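Continuing the hypothetical \texttt{people} table from the previous sketch, typical schema evolutions might read:
\begin{verbatim}
-- Existing rows receive the DEFAULT value instead of NULL
ALTER TABLE people ADD COLUMN country VARCHAR(50) DEFAULT 'Switzerland';

-- Rename an existing attribute
ALTER TABLE people RENAME COLUMN birthday TO date_of_birth;
\end{verbatim}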
\section{Data Population and Manipulation}
While data modification is primarily part of the Data Manipulation Language (DML), it is closely tied to definition through constraints.
\subsection{Insertion Strategies}
The most basic way to populate a table is the \texttt{INSERT INTO} statement followed by \texttt{VALUES}. One can insert a single record or multiple records in one command. If certain columns are omitted, the system will attempt to fill them with \texttt{NULL} or a defined default value.
\thm{Insertion via Subqueries}{Instead of providing explicit values, an \texttt{INSERT} statement can use a \texttt{SELECT} subquery to compute a set of tuples from other tables and insert them into the target relation.}
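A brief sketch of both insertion styles, with a hypothetical \texttt{students} table serving as the source of the subquery:
\begin{verbatim}
-- Explicit values; omitted columns are filled with NULL or their default
INSERT INTO people (name, age)
VALUES ('Alice', 30), ('Bob', 25);

-- Bulk insertion computed by a subquery
INSERT INTO people (name, age)
SELECT student_name, student_age
FROM   students
WHERE  enrolled = TRUE;
\end{verbatim}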
\subsection{Updates and Deletions}
Data can be modified using \texttt{UPDATE}, which changes values in existing tuples based on a condition, or removed using \texttt{DELETE FROM}, which deletes specific rows while keeping the table's structure intact.
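For instance, again on the hypothetical \texttt{people} table:
\begin{verbatim}
-- Modify only the tuples that satisfy the condition
UPDATE people SET age = age + 1 WHERE name = 'Alice';

-- Remove tuples; the table itself and its schema remain
DELETE FROM people WHERE age < 18;
\end{verbatim}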
\section{Consistency and Integrity Constraints}
Constraints are rules used to prevent the entry of invalid data, effectively enforcing relational and domain integrity at the database level.
\dfn{NULL Value}{A special marker used in SQL to indicate that a data value is unknown, inapplicable, or kept secret. It is not equivalent to zero or an empty string.}
\subsection{Fundamental Constraints}
\begin{itemize}
\item \textbf{NOT NULL}: This ensures that a column can never contain \texttt{NULL}, i.e., no missing or unknown values are allowed. It is a primary tool for pushing a database toward strict relational integrity.
\item \textbf{UNIQUE}: This requires that every non-null value in a column be distinct. It can be applied to a single column or a combination of columns (table constraint).
\item \textbf{CHECK}: This allows for arbitrary conditions that every row must satisfy, such as ensuring a price is positive or a date is within a valid range.
\end{itemize}
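All three constraints can be declared directly in the table definition; the following sketch uses a hypothetical \texttt{products} table:
\begin{verbatim}
CREATE TABLE products (
    code  CHAR(8)        NOT NULL,
    name  VARCHAR(100)   NOT NULL,
    price DECIMAL(10,2)  CHECK (price > 0),
    UNIQUE (code)
);
\end{verbatim}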
\section{Primary and Foreign Keys}
Keys are the most critical constraints in relational design as they define how records are identified and linked.
\subsection{Primary Keys}
A primary key is an attribute or set of attributes that uniquely identifies a row. By definition, a primary key must be \texttt{UNIQUE} and \texttt{NOT NULL}. Every table should ideally have one primary key to ensure each record can be referenced without ambiguity.
\subsection{Foreign Keys and Referential Integrity}
A foreign key is an attribute in one table that references a unique or primary key in another table. This creates a link between the two relations.
\thm{Referential Integrity}{This constraint ensures that every value in a foreign key column must either be \texttt{NULL} or exist in the referenced primary key column of the related table.}
\subsection{Handling Deletions in References}
If a referenced value is deleted, the system must follow a specific policy to maintain integrity.
\nt{Common policies for handling the deletion of a referenced record include:
\begin{itemize}
\item \textbf{CASCADE}: Automatically delete or update the referencing rows.
\item \textbf{RESTRICT/NO ACTION}: Prohibit the deletion if references exist.
\item \textbf{SET NULL}: Reset the foreign key of the referencing rows to \texttt{NULL}.
\item \textbf{SET DEFAULT}: Reset the foreign key to its default value.
\end{itemize}}
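A minimal sketch of a foreign key with explicit policies, using hypothetical \texttt{countries} and \texttt{cities} tables:
\begin{verbatim}
CREATE TABLE countries (
    code CHAR(2)      PRIMARY KEY,
    name VARCHAR(100) NOT NULL
);

CREATE TABLE cities (
    name         VARCHAR(100),
    country_code CHAR(2) REFERENCES countries (code)
                         ON DELETE CASCADE
                         ON UPDATE RESTRICT
);
\end{verbatim}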
\section{Lexical vs. Value Space}
A sophisticated concept in data definition is the distinction between how data is represented and what it actually is. The "Value Space" refers to the abstract mathematical object (e.g., the concept of the number four), while the "Lexical Space" refers to the various ways that value can be written in a query (e.g., '4', '4.0', '04', or '4e0'). SQL engines are responsible for mapping various lexical representations to the correct underlying value space to perform comparisons and arithmetic accurately.
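A small illustration of several lexical representations mapping to the same value (PostgreSQL syntax; the literals are chosen purely for illustration):
\begin{verbatim}
-- All three comparisons evaluate to TRUE: '4', '4.0' and '4e0' are
-- different lexical representations of the same numeric value
SELECT CAST('4'   AS INTEGER)          = 4,
       CAST('4.0' AS DECIMAL(3,1))     = 4,
       CAST('4e0' AS DOUBLE PRECISION) = 4;
\end{verbatim}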

View File

@@ -1,50 +1,121 @@
\chapter{Database Architecture}
The management of transactions is the core mechanism that ensures a database remains reliable and consistent despite concurrent access and system failures. A transaction is defined as a logical unit of work, consisting of one or more database operations that must be executed as an indivisible whole. This chapter explores the multi-tier architecture that supports these operations, the physical storage layer that provides data independence, and the sophisticated logging and concurrency control protocols used to maintain the ACID properties. We investigate how the system handles crashes through undo and redo logging, how schedulers prevent interference between users through locking and timestamping, and how complex, long-running processes are managed through the use of sagas and compensating transactions.
\section{The Architectural Context of Transactions}
Modern database systems are typically deployed in a three-tier architecture to separate user interaction from business logic and data persistence.
\dfn{Three-Tier Architecture}{A system organization consisting of three distinct layers: the Web Server tier for managing client interactions, the Application Server tier for executing business logic and generating queries, and the Database Server tier for managing data storage and transaction execution.}
The database tier is designed to provide data independence, allowing users to query data without needing to understand the underlying physical storage mechanics. Behind the scenes, the DBMS manages a complex hierarchy of hardware, moving data between volatile main memory (RAM) and nonvolatile storage (Disk).
\nt{Data independence is a fundamental principle established by Edgar Codd. It ensures that the logical representation of data in tables is decoupled from the physical directories and files on the disk, such as the \texttt{PGDATA} directory in PostgreSQL.}
\section{Physical Storage and Data Movement}
The unit of interaction between the disk and main memory is not the individual record, but the block or page. In many systems, such as PostgreSQL, these blocks are typically 8 KB in size.
\dfn{Database Element}{A unit of data that can be accessed or modified by a transaction. While elements can be tuples or relations, they are most effectively treated as disk blocks to ensure atomic writes to nonvolatile storage.}
\thm{The I/O Model of Computation}{The primary cost of database operations is measured by the number of disk I/O actions. Because accessing a disk is orders of magnitude slower than CPU cycles, efficiency is achieved by minimizing the transfer of blocks between the disk and memory buffers.}
When a record is too large to fit within a standard page—such as large text objects or binary data—the system employs specialized techniques like TOAST (The Oversized-Attribute Storage Technique), which slices the oversized value into chunks and stores them in a separate table. Physically, pages are grouped into larger files on disk; in PostgreSQL, these files are split into segments of at most 1 GB.
\section{The ACID Properties of Transactions}
To ensure the integrity of the database, every transaction must satisfy the ACID test. These properties guarantee that the database remains in a consistent state even if a program is interrupted or multiple users attempt to modify the same record.
\thm{ACID Properties}{
\begin{itemize}
\item \textbf{Atomicity:} The "all-or-nothing" execution of transactions. If any part fails, the entire unit is rolled back.
\item \textbf{Consistency:} Every transaction must move the database from one valid state to another, satisfying all structural and business constraints.
\item \textbf{Isolation:} Each transaction must appear to execute as if no other transaction were occurring simultaneously.
\item \textbf{Durability:} Once a transaction is committed, its effects must persist permanently, surviving any subsequent system crash.
\end{itemize}}
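As a minimal SQL sketch of atomicity and durability (the \texttt{accounts} table is hypothetical), the transfer below is applied either in full or not at all; issuing \texttt{ROLLBACK} instead of \texttt{COMMIT} would undo both updates:
\begin{verbatim}
BEGIN;
UPDATE accounts SET balance = balance - 100 WHERE id = 1;
UPDATE accounts SET balance = balance + 100 WHERE id = 2;
COMMIT;   -- once this returns, the change survives any crash
\end{verbatim}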
\section{Undo Logging and Recovery}
Logging is the primary method for achieving durability and atomicity. The log is an append-only file that records every important change to the database.
\dfn{Undo Logging}{A logging method where only the old values of modified data elements are recorded. It is designed to allow the recovery manager to cancel the effects of uncommitted transactions by restoring data to its previous state.}
For undo logging to function correctly, two specific rules must be followed:
1. Every update record (the old value) must be written to the disk before the modified data element itself reaches the disk.
2. The commit record must be written to the disk only after all modified data elements have been successfully flushed to the disk.
\nt{In undo logging, the order of writes to disk is: Log record $\to$ Data element $\to$ Commit record. This ensures that if a crash occurs before the commit, we always have the old value available to undo the change.}
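As a small worked example with hypothetical elements, suppose transaction $T$ changes $A$ from 8 to 16 and $B$ from 5 to 10. Under undo logging, the log records
\[
\langle \mathrm{START}\ T\rangle,\quad \langle T, A, 8\rangle,\quad \langle T, B, 5\rangle
\]
must reach the disk first, then the new values of $A$ and $B$ are flushed, and only then is $\langle \mathrm{COMMIT}\ T\rangle$ written. If a crash happens before the commit record is on disk, recovery scans the log backwards and restores $A := 8$ and $B := 5$; if the commit record is present, $T$ is left alone.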
\section{Redo Logging and the Write-Ahead Rule}
While undo logging requires immediate data flushes, redo logging offers more flexibility by recording only the new values of data elements.
\dfn{Redo Logging}{A logging method that records the new values of database elements. On recovery, the system repeats the changes of committed transactions and ignores those that did not commit.}
\thm{Write-Ahead Logging (WAL) Rule}{In redo logging, all log records pertaining to a modification, including the update record and the commit record, must appear on disk before the modified data element itself is written to disk.}
The order of operations for redo logging is: Log record $\to$ Commit record $\to$ Data element. This allows the system to keep changed data in memory buffers longer, potentially reducing disk I/O, as the log provides a way to "redo" the work if memory is lost.
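Mirroring the earlier undo example (same hypothetical transaction), a redo log stores the new values and obeys the write-ahead rule:
\[
\langle \mathrm{START}\ T\rangle,\quad \langle T, A, 16\rangle,\quad \langle T, B, 10\rangle,\quad \langle \mathrm{COMMIT}\ T\rangle
\]
All four records must be on disk before the changed blocks are written. During recovery, committed transactions are replayed ($A := 16$, $B := 10$), while transactions with no commit record are simply ignored.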
\section{Undo/Redo Logging and Checkpointing}
A hybrid approach, undo/redo logging, records both the old and new values of a database element ($\langle T, X, v, w\rangle$). This provides the highest level of flexibility, as the commit record can be written either before or after the data elements are flushed to disk.
\nt{Undo/redo logging is the most common method in modern DBMSs because it allows the buffer manager to be more efficient. It only requires that the log record for a change reach the disk before the change itself does.}
To avoid scanning the entire log during recovery, the system uses checkpointing.
\dfn{Nonquiescent Checkpointing}{A technique that allows the system to mark a "safe" point in the log without shutting down the database. It records the set of active transactions and ensures that all data changed by previously committed transactions has reached the disk.}
\section{Concurrency Control and Serializability}
When multiple transactions run at once, their actions form a schedule. The goal of the scheduler is to ensure that this schedule is serializable.
\dfn{Conflict-Serializable Schedule}{A schedule that can be transformed into a serial schedule (where transactions run one after another) by a sequence of swaps of adjacent, non-conflicting actions.}
\thm{The Precedence Graph Test}{A schedule is conflict-serializable if and only if its precedence graph—where nodes are transactions and arcs represent conflicts—is acyclic. A conflict occurs if two transactions access the same element and at least one is a write.}
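For instance, in the hypothetical schedule $r_1(A)\, w_2(A)\, w_1(A)$, the conflict $r_1(A)$ before $w_2(A)$ contributes the arc $T_1 \to T_2$, and $w_2(A)$ before $w_1(A)$ contributes $T_2 \to T_1$:
\[
r_1(A)\; w_2(A)\; w_1(A) \quad\Longrightarrow\quad T_1 \to T_2 \to T_1
\]
The precedence graph contains a cycle, so this schedule is not conflict-serializable.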
\section{Lock-Based Schedulers and Two-Phase Locking}
The most common way to enforce serializability is through the use of locks. Before a transaction can access a database element, it must obtain a lock on that element from the scheduler's lock table.
\dfn{Shared and Exclusive Locks}{A Shared (S) lock allows multiple transactions to read an element simultaneously. An Exclusive (X) lock is required for writing and prevents any other transaction from reading or writing that element.}
Simply taking locks is insufficient; the timing of when locks are released is vital for maintaining a consistent state.
\thm{Two-Phase Locking (2PL)}{A protocol requiring that in every transaction, all lock acquisitions must precede all lock releases. This creates a "growing phase" where locks are gathered and a "shrinking phase" where they are surrendered.}
\nt{Strict Two-Phase Locking is a variation where a transaction holds all its exclusive locks until it commits or aborts. This prevents other transactions from reading "dirty data"—values written by uncommitted transactions—and eliminates the need for cascading rollbacks.}
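A compact, hypothetical illustration of the two phases, writing $l$ for lock and $u$ for unlock:
\[
l_1(A)\; r_1(A)\; l_1(B)\; w_1(B)\;\mid\; u_1(A)\; u_1(B)
\]
Everything before the bar is the growing phase (only acquisitions); everything after it is the shrinking phase (only releases). Once $T_1$ has released a lock, 2PL forbids it from acquiring another.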
\section{Deadlock Management}
Locking systems are inherently prone to deadlocks, where transactions are stuck in a cycle of waiting for one another. Schedulers must implement strategies to detect or prevent these states.
\dfn{Waits-For Graph}{A directed graph used for deadlock detection. Nodes represent transactions, and an arc from $T$ to $U$ indicates that $T$ is waiting for a lock currently held by $U$. A cycle in this graph indicates a deadlock.}
Prevention strategies often involve timestamps. Two popular methods are:
\begin{itemize}
\item \textbf{Wait-Die:} If an older transaction needs a lock held by a newer one, it waits. If a newer transaction needs a lock held by an older one, it dies (rolls back).
\item \textbf{Wound-Wait:} An older transaction "wounds" (forces a rollback) a newer transaction that holds a lock it needs. A newer transaction must wait for an older one.
\end{itemize}
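As a concrete comparison with hypothetical timestamps, suppose $T_5$ (older, started at time 5) and $T_{10}$ (younger) compete for the same locks; the two policies resolve the conflict in mirror-image ways:
\begin{itemize}
\item \textbf{Wait-Die:} $T_5$ requesting a lock held by $T_{10}$ waits; $T_{10}$ requesting a lock held by $T_5$ is rolled back.
\item \textbf{Wound-Wait:} $T_5$ requesting a lock held by $T_{10}$ forces $T_{10}$ to roll back; $T_{10}$ requesting a lock held by $T_5$ waits.
\end{itemize}
In both schemes the rolled-back transaction restarts with its original timestamp, so it eventually becomes the oldest transaction and cannot starve.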
\section{Alternative Concurrency Control: Timestamps and Validation}
Beyond locking, systems may use optimistic concurrency control methods, which are particularly effective when conflicts are rare.
\dfn{Timestamp-Based Scheduling}{A method where each transaction is assigned a unique timestamp when it begins. The scheduler maintains read and write times for every database element and rolls back any transaction that attempts to perform a "physically unrealizable" action, such as reading a value written in its future.}
\dfn{Validation-Based Scheduling}{An optimistic approach where transactions execute in a private workspace. Before committing, the transaction enters a validation phase where the scheduler checks its read and write sets against those of other active transactions to ensure no serializability violations occurred.}
\section{Long-Duration Transactions and Sagas}
In environments like design systems or workflow management, transactions can last for hours or even days. Holding locks for such durations would paralyze the system.
\dfn{Saga}{A long-duration transaction consisting of a sequence of smaller, independent actions. Each action is its own transaction that commits immediately.}
\thm{Compensating Transactions}{For every action $A$ in a saga, there must be a corresponding compensating transaction $A^{-1}$ that logically undoes the effects of $A$. If the saga must abort, the system executes the compensating transactions in reverse order to return the database to a consistent state.}
\nt{A saga does not strictly follow the traditional "Isolation" property of ACID, as the results of its intermediate actions are visible to other transactions. However, through the use of compensation, it maintains the logical consistency of the system.}
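A hedged sketch of a two-step saga in SQL (the travel-booking tables and values are hypothetical); each step commits on its own, and the compensations are executed in reverse order if the saga must abort:
\begin{verbatim}
-- Step A: its own transaction
BEGIN;
INSERT INTO flight_bookings (saga_id, flight) VALUES (7, 'LX318');
COMMIT;

-- Step B: its own transaction
BEGIN;
INSERT INTO hotel_bookings (saga_id, hotel) VALUES (7, 'Seaside Inn');
COMMIT;

-- Compensations, run in reverse order if the saga aborts:
DELETE FROM hotel_bookings  WHERE saga_id = 7;  -- compensates B
DELETE FROM flight_bookings WHERE saga_id = 7;  -- compensates A
\end{verbatim}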
In conclusion, transaction management requires a delicate balance between performance and correctness. By combining robust logging for durability, strict locking for isolation, and innovative structures like sagas for long-term processes, modern database systems provide a stable foundation for complex information ecosystems. These mechanisms ensure that even in the event of hardware failure or intense concurrent demand, the integrity of the data remains unassailable.
To visualize a transaction, think of it as a set of instructions for a complicated recipe. If you get halfway through and realize you are missing a vital ingredient, you cannot just stop and leave the half-mixed dough on the counter. You must either finish the recipe or clean up the mess so the kitchen is exactly as it was before you started. The database scheduler and log manager are like the head chef, ensuring that every cook has the tools they need and that no one's flour ends up in someone else's soup.

View File

@@ -1,138 +1,139 @@
\chapter{Database Design Theory}
Database design theory provides a mathematical foundation for creating relational schemas that are both efficient and resilient to errors. The primary objective is to avoid anomalies—logical inconsistencies that arise when a schema is poorly structured. These issues usually stem from a single relation attempting to store too many distinct types of information. By applying formal techniques such as functional dependencies and normalization, designers can decompose complex tables into smaller, well-structured ones that preserve data integrity while reducing redundancy. This chapter explores the fundamental concepts of functional dependencies, the definition of various keys, and the criteria for Boyce-Codd Normal Form, concluding with algorithms to ensure that decompositions do not lose information.
\section{Anomalies and the Need for Better Design}
When a database schema is designed without adhering to theoretical principles, it often suffers from three major types of anomalies. These anomalies make the database difficult to maintain and prone to data corruption over time.
\dfn{Update Anomaly}{A situation where a piece of information is stored multiple times due to redundancy. If that information changes, every instance must be updated simultaneously. Failure to do so leads to an inconsistent state where different records provide conflicting information for the same logical entity.}
\dfn{Deletion Anomaly}{Occurs when the deletion of certain data inadvertently results in the loss of other, unrelated information that was stored in the same record. For example, if customer information and product pricing are stored in a single table, deleting a customer's only order might result in the loss of all data regarding that product's existence.}
\dfn{Insertion Anomaly}{Arises when it is impossible to store certain information because it requires the presence of other, currently unavailable data. A common example is being unable to record a new product's price because no customer has ordered it yet, or being unable to store a new customer's details until they make their first purchase.}
The intuitive solution to these problems is the separation of functions. Instead of one massive table, information should be distributed across multiple tables, each dedicated to a single functional purpose (e.g., one for customers, one for products, and one for transactions). These tables are then linked through primary and foreign keys.
\section{Functional Dependencies}
The core mathematical tool in design theory is the functional dependency. It allows us to formalize the relationships between attributes and provides a method to determine if a schema is well-designed.
\dfn{Functional Dependency (FD)}{A statement about a relation $R$ such that for any two tuples $t_1$ and $t_2$, if they agree on the values of a set of attributes $S$, they must also agree on the values of another set of attributes $T$. This is denoted as $S \to T$. We say that $S$ functionally determines $T$.}
\thm{The No Coincidences Assumption}{In the study of design theory, we assume that if a functional dependency $S \to T$ holds, it is because of a structural requirement of the application, not an accidental coincidence in the current data. The dependency must hold for all possible valid instances of the database.}
Functional dependencies can be visualized as a lookup process: if you know the value of $S$, there is a unique value of $T$ associated with it. However, this is not a mathematical function that can be calculated from first principles; it is a relationship maintained by the database state.
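A standard illustration over a hypothetical Movies(title, year, length, genre) relation: two movies may share a title, but title and year together determine the rest,
\[
\{\mathrm{title}, \mathrm{year}\} \to \{\mathrm{length}, \mathrm{genre}\},
\]
whereas $\mathrm{title} \to \mathrm{length}$ does not hold, since remakes reuse titles.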
\section{Rules for Reasoning About Functional Dependencies}
We can deduce new functional dependencies from a set of existing ones using logical rules. These rules allow us to simplify sets of dependencies or verify if a specific dependency holds.
\dfn{Trivial Functional Dependency}{An FD $S \to T$ is trivial if $T \subseteq S$. Such a dependency always holds regardless of the data, as it simply states that if you know a set of values, you know any subset of those values.}
\thm{Armstrong's Axioms}{A complete set of rules for inferring all functional dependencies that follow from a given set:
\begin{itemize}
\item \textbf{Reflexivity:} If $T \subseteq S$, then $S \to T$.
\item \textbf{Augmentation:} If $S \to T$, then $SZ \to TZ$ for any set of attributes $Z$.
\item \textbf{Transitivity:} If $S \to T$ and $T \to U$, then $S \to U$.
\end{itemize}}
In addition to the axioms, two practical rules are frequently used to manipulate dependencies:
\begin{itemize}
\item \textbf{Splitting Rule:} $A \to BC$ is equivalent to $A \to B$ and $A \to C$.
\item \textbf{Combining Rule:} $A \to B$ and $A \to C$ is equivalent to $A \to BC$.
\end{itemize}
Note that these rules only apply to the right-hand side of a dependency. One cannot split the left-hand side (e.g., $AB \to C$ does not imply $A \to C$).
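As a brief worked derivation over hypothetical attributes, suppose $A \to B$ and $B \to C$ hold. Transitivity gives $A \to C$, the combining rule then gives $A \to BC$, and augmentation gives $AD \to BCD$ for any further attribute $D$. None of this, however, would follow from $AB \to C$ alone, precisely because left-hand sides cannot be split.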
\section{Attribute Closure and Minimal Basis}
To determine everything that a set of attributes $X$ can determine, we calculate its closure.
\dfn{Attribute Closure}{The closure of a set of attributes $X$ under a set of FDs $F$, denoted $X^+$, is the set of all attributes $A$ such that $X \to A$ can be derived from $F$. The algorithm starts with $X$ and repeatedly adds the right-hand side of any FD whose left-hand side is already contained within the current set.}
\nt{The closure algorithm is essential for finding keys. If $X^+$ contains all attributes of the relation, then $X$ is a superkey.}
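A short worked computation under hypothetical FDs: let $R(A, B, C, D)$ with $F = \{A \to B,\; B \to C\}$ and compute $\{A, D\}^+$:
\[
\{A, D\} \;\Rightarrow\; \{A, B, D\} \;\;(\mathrm{using}\ A \to B) \;\Rightarrow\; \{A, B, C, D\} \;\;(\mathrm{using}\ B \to C).
\]
Since the closure contains every attribute of $R$, the set $\{A, D\}$ is a superkey; because neither $\{A\}^+$ nor $\{D\}^+$ covers all attributes, it is in fact a candidate key.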
Sets of FDs can be redundant. To streamline the design process, we often look for a minimal basis.
\dfn{Minimal Basis}{A set of FDs $B$ is a minimal basis for a set $F$ if:
\begin{itemize}
\item $B$ is equivalent to $F$.
\item Every right-hand side in $B$ is a single attribute.
\item No FD in $B$ can be removed without losing equivalence.
\item No attribute can be removed from the left-hand side of an FD in $B$ without losing equivalence.
\end{itemize}}
\section{Defining Keys in Relational Design}
Keys are subsets of attributes that uniquely identify records. Understanding the hierarchy of keys is necessary for normalization.
\dfn{Superkey}{A set of attributes $K$ is a superkey for relation $R$ if $K$ functionally determines all other attributes in $R$. Every relation has at least one superkey: the set of all its attributes.}
\dfn{Candidate Key}{A candidate key is a minimal superkey. This means that if any attribute is removed from the set, it is no longer a superkey.}
\dfn{Primary Key}{A specific candidate key chosen by the database designer to be the principal means of identifying tuples within a relation.}
\nt{An attribute is considered \textbf{prime} if it is a member of at least one candidate key. Otherwise, it is \textbf{non-prime}.}
\section{Boyce-Codd Normal Form (BCNF)}
The first level of normalization is often considered the First Normal Form (1NF), which requires that every attribute value be atomic (no nested tables or arrays). However, to eliminate the anomalies discussed earlier, we require stricter forms like BCNF.
\dfn{Boyce-Codd Normal Form (BCNF)}{A relation $R$ is in BCNF if and only if for every non-trivial functional dependency $S \to T$ that holds in $R$, $S$ is a superkey of $R$.}
\thm{Two-Attribute BCNF}{Any relation with exactly two attributes is guaranteed to be in BCNF, regardless of the dependencies present.}
If a relation violates BCNF, it means there is a ``determinant'' (a left-hand side of an FD) that does not uniquely identify the entire row. This causes redundancy and the risk of anomalies.
\section{The BCNF Decomposition Algorithm}
To bring a relation $R$ into BCNF, we follow a recursive decomposition process:
\begin{enumerate}
\item Identify a non-trivial FD $S \to T$ that violates BCNF (where $S$ is not a superkey).
\item Compute $S^+$.
\item Decompose $R$ into two relations:
\begin{itemize}
\item $R_1$ with attributes in $S^+$.
\item $R_2$ with attributes $S$ and all attributes of $R$ that are not in $S^+$.
\end{itemize}
\item Recursively apply the algorithm to $R_1$ and $R_2$ until all resulting tables are in BCNF.
\end{enumerate}
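To make the steps concrete, consider the classic (hypothetical) relation Movies(title, year, length, studioName, studioAddr) with key $\{\mathrm{title}, \mathrm{year}\}$ and the FD $\mathrm{studioName} \to \mathrm{studioAddr}$, which violates BCNF because studioName is not a superkey. Computing $\{\mathrm{studioName}\}^+ = \{\mathrm{studioName}, \mathrm{studioAddr}\}$ yields the decomposition
\[
R_1(\mathrm{studioName}, \mathrm{studioAddr}), \qquad R_2(\mathrm{title}, \mathrm{year}, \mathrm{length}, \mathrm{studioName}),
\]
and both resulting relations are in BCNF.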
\section{Lossless Join and the Chase Algorithm}
A critical requirement of decomposition is that it must be ``lossless.'' We must be able to reconstruct the original relation exactly by joining the decomposed relations.
\thm{The Lossless Join Property}{A decomposition of $R$ into $R_1, R_2, \dots, R_k$ is lossless if the natural join of all $R_i$ produces exactly the original instance of $R$. A decomposition can never result in fewer tuples than the original, but a ``lossy'' join creates ``ghost tuples''—records that weren't in the original data.}
To verify if a join is lossless, we use the Chase Algorithm.
\dfn{Chase Algorithm}{A test for a lossless join. We create a tableau representing a tuple in the join. We then apply the functional dependencies of the original relation to equate symbols in the tableau. If one row eventually becomes identical to the target tuple (all unsubscripted symbols), the join is proven to be lossless.}
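A minimal worked instance with a hypothetical relation: decompose $R(A, B, C)$ with the FD $B \to C$ into $R_1(A, B)$ and $R_2(B, C)$. The initial tableau contains one row per component relation:
\[
\begin{array}{c|ccc}
 & A & B & C \\ \hline
R_1 & a & b & c_1 \\
R_2 & a_2 & b & c
\end{array}
\]
Both rows agree on $B$, so $B \to C$ lets us equate $c_1$ with $c$; the first row becomes $(a, b, c)$, entirely unsubscripted, and the decomposition is therefore lossless.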
\section{Dependency Preservation and the Impossibility Triangle}
While BCNF eliminates redundancy and ensures a lossless join, it does not always preserve functional dependencies. A dependency $S \to T$ is preserved if it can be checked within a single relation of the decomposition.
\nt{Database designers often face a trade-off. It is not always possible to achieve BCNF, a lossless join, and dependency preservation simultaneously. If preserving a specific dependency is vital for the application, the designer might settle for Third Normal Form (3NF).}
\dfn{Third Normal Form (3NF)}{A relation $R$ is in 3NF if for every non-trivial FD $S \to T$, either $S$ is a superkey or every attribute in $T \setminus S$ is prime (part of some candidate key). 3NF is more lenient than BCNF because it allows certain dependencies where the left side is not a superkey, provided the right side is prime.}
\section{Multivalued Dependencies and 4NF}
Sometimes, BCNF is insufficient to remove all redundancy. This occurs when a table attempts to store two independent many-to-many relationships for the same key.
\dfn{Multivalued Dependency (MVD)}{A statement $S \to\to T$ which implies that for a given value of $S$, the associated values of $T$ are independent of the values of the other attributes in the relation. If we fix $S$, we must see all possible combinations of $T$ and the other attributes.}
\thm{Fourth Normal Form (4NF)}{A relation $R$ is in 4NF if for every non-trivial MVD $S \to\to T$, $S$ is a superkey. This is a generalization of BCNF that also accounts for redundancies caused by MVDs.}
Decomposition into 4NF follows a similar logic to BCNF decomposition but uses MVDs to split the tables. By reaching 4NF, the designer eliminates nearly all forms of logical redundancy.
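For instance, in a hypothetical relation R(person, phone, hobby) where a person's phones and hobbies are unrelated, the MVD $\mathrm{person} \to\to \mathrm{phone}$ forces every combination to appear: a person with two phones and two hobbies must occur in four tuples. Decomposing into $R_1(\mathrm{person}, \mathrm{phone})$ and $R_2(\mathrm{person}, \mathrm{hobby})$ reaches 4NF and removes that redundancy.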
\section{Design Heuristics and Practical Application}
The theoretical process often involves a ``representative'' table—a snapshot of the data that includes enough examples to prevent the designer from assuming ``accidental'' dependencies. Designers must distinguish between structural dependencies that must always hold and coincidences in a specific dataset.
\thm{The Table Universe Concept}{A table universe consists of all possible valid instances of a schema over time. Integrity constraints and normal forms must apply to every instance in this universe, ensuring that the database remains robust as it grows and changes.}
Database design theory transforms the intuitive process of organizing data into a rigorous engineering discipline. By defining functional dependencies, we uncover the hidden structure of data. By identifying keys and superkeys, we establish the requirements for unique identification. Through attribute closure, we derive the full implications of our constraints. By ensuring that every table does "one thing" and follows the rules of BCNF or 3NF, developers can build systems that are significantly cheaper to maintain, as the logic for handling anomalies does not need to be hard-coded into the application layer. Instead, the structure of the database itself prevents corruption.
\nt{Normal forms are cumulative: 4NF implies BCNF, which implies 3NF, which implies 2NF, which implies 1NF. The higher the normal form, the fewer anomalies exist in the schema.}
In conclusion, database design theory moves database creation from an intuitive art to a rigorous science. By understanding the interplay between keys, dependencies, and normalization, engineers can create high-performance data systems that remain consistent through complex transactional workloads.

View File

@@ -1,103 +1,72 @@
\chapter{Introduction}
Modern engineering increasingly relies on the structured management of information, treating data as the fundamental digital substance equivalent to physical matter. In the pursuit of understanding the world, we can categorize scientific inquiry into a matrix of paradigms. While mathematics explores necessary truths through natural thought and computer science analyzes the theoretical necessity of artificial computation, physics observes the world as it exists. Data science effectively acts as the ``physics of computer science,'' utilizing machine-driven computation to observe and interpret the world through empirical evidence.
The objective of an information system is to transform raw observations into actionable intelligence. This process follows a strict hierarchy. Data consists of raw, uninterpreted facts that are stored and moved between systems. When these facts are associated with specific meanings, they become information. Finally, when this information is applied to meaningful tasks or decision-making, it evolves into knowledge.
\dfn{Information System}{A software program or a synchronized set of programs designed to manage, store, and provide efficient access to information.}
\thm{The Knowledge Hierarchy}{The structured progression from raw data to information through added meaning, culminating in knowledge through practical application.}
\nt{In modern engineering, making superior decisions is no longer just about observing numbers but about leveraging knowledge derived through information systems.}
\section{The Historical Evolution of Data Management}
The history of data management is a narrative of scaling human memory and communication. Before the advent of technology, information was transmitted via oral traditions, which were hindered by the limitations of human recall and distance. The invention of writing marked the first major turning point, allowing symbols to be preserved on durable media such as stone or clay.
Ancient civilizations intuitively adopted the tabular format for data. Clay tablets from thousands of years ago have been discovered containing relational data, such as Pythagorean triples, organized in rows and columns. This indicates that tables are a primary cognitive tool for human information organization. The invention of the printing press in the 15th century further enabled the mass distribution of data, leading eventually to the mechanical and electronic computing revolutions of the 20th century.
In the early decades of computing, specifically the 1960s, data management was handled through direct file systems. Programmers were required to know the physical location of data on a disk and write complex logic to retrieve it. This changed in 1970 when Edgar Codd introduced the relational model. He argued that users should interact with data through intuitive tables, while the underlying machine complexities remained hidden. This principle of data independence paved the way for the Object Era in the 1980s and the NoSQL Era in the 2000s, the latter of which was driven by the massive scale of modern social networks and search engines.
\nt{The tabular format has remained the most intuitive and enduring method for humans to represent structured data, from ancient clay to modern SQL.}
\section{The Structure and Shapes of Data}
Data is categorized based on its degree of organization. Unstructured data, such as natural language text, audio, images, and video, exists in a raw form that was historically difficult for computers to process. However, recent breakthroughs in linear algebra and vector-based mathematics have enabled modern systems to interpret and even generate this type of content.
Structured data is the highly organized information typically found in spreadsheets and relational databases. Between these lies semi-structured data, which uses tags (like XML or JSON) to provide some semantic context without the rigid requirements of a fixed schema. To manage these types, engineers utilize data models—mathematical notations for describing data structures, the operations allowed on them, and the constraints they must follow.
\dfn{Data Model}{A formal notation that describes the structure of data, the methods for querying and modifying it, and the rules that maintain its integrity.}
\thm{The Three Vs of Big Data}{The defining challenges of modern data management are Volume (the sheer amount of bytes), Variety (the diversity of data types), and Velocity (the speed at which data is generated and must be processed).}
\section{The Necessity of Database Management Systems}
In primitive computing environments, applications directly accessed files on local disks. This approach resulted in severe problems as systems grew. Data was often redundant (stored in multiple places) and inconsistent (versions of the same data conflicting). It was also difficult to combine data from different sources or control who had access to specific information.
A Database Management System (DBMS) resolves these issues by serving as a central software layer. A robust DBMS is expected to fulfill five primary roles:
\begin{enumerate}
\item Allow users to define the structure (schema) of new databases.
\item Provide high-level languages for querying and changing data.
\item Facilitate the storage of massive datasets over long durations.
\item Ensure durability by recovering data after system failures.
\item Manage concurrent access by multiple users to prevent data corruption.
\end{enumerate}
Modern information systems are often organized into a three-tier architecture to separate concerns and improve scalability. The top layer is the user interface or front-end, which manages the presentation and user interaction. The middle layer is the business logic, where the specific rules and processes of the application are defined. The bottom layer is the database system, which handles data persistence and management. \dfn{Database Management System (DBMS)}{A specialized software suite used to create, manage, and query databases, shielding the user from physical storage details.}
Within this architecture, the DBMS itself is divided into various components. A storage manager controls how data is placed on disk and moved between the disk and main memory. The query processor parses and optimizes requests to find the most efficient execution plan. The transaction manager ensures that database operations are performed safely and reliably. \nt{A "Database System" is the holistic term for the combination of the DBMS software and the actual data stored within it.}
\dfn{3-Tier Architecture}{ \section{System Architecture and Data Independence}
A software design pattern consisting of three layers: the presentation layer (User Interface), the logic layer (Business Logic), and the data layer (Database System).
}
\section{Database Languages: DDL and DML} Most modern information systems utilize a three-tier architecture to ensure modularity and scalability. The top layer is the User Interface (UI), which handles human interaction. The middle layer is the Business Logic, where the rules of the application are processed. The bottom layer is the Persistence layer, where the DBMS manages data storage on a disk or in the cloud.
Interaction with a DBMS occurs through two primary types of languages. The Data Definition Language (DDL) is used to establish and modify the metadata, which is the "data about data" describing the schema and constraints of the database. The Data Manipulation Language (DML) is used to search, retrieve, and modify the actual data stored within that schema. The most vital concept within this architecture is data independence, championed by Edgar Codd. This principle separates the logical level (the tables humans see) from the physical level (the bits stored on the machine). Because of this separation, an engineer can change the physical storage medium—from a hard drive to a data center or even DNA storage—without the user ever needing to change their queries.
These languages can be further categorized as imperative or declarative. Imperative languages require the programmer to specify \emph{how} to perform a task (e.g., C++, Java), while declarative languages, most notably SQL, allow the user to specify \emph{what} they want, leaving the "how" to the system's query optimizer. \dfn{Data Independence}{The ability of a database system to provide a stable logical view of data that is entirely independent of its physical storage implementation.}
\dfn{Metadata}{ \thm{Three-Tier Architecture}{A design pattern that divides an application into the presentation, logic, and data management layers to simplify development and maintenance.}
The structural information that defines the types and constraints of the data, essentially acting as a blueprint for the database.
}
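As a minimal sketch (the \texttt{Movies} table and its columns here are hypothetical), the distinction shows directly in the syntax: a DDL statement creates metadata, while a DML statement operates on the records described by that metadata.
\begin{verbatim}
-- DDL: defines the schema (metadata) of a new relation
CREATE TABLE Movies (
    title  VARCHAR(100),
    year   INT,
    length INT
);

-- DML: queries the records stored under that schema
SELECT title, year
FROM   Movies
WHERE  length > 120;
\end{verbatim}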
\thm{Declarative Language Property}{ \section{Query Languages and Internal Processes}
The characteristic of languages like SQL that allows users to describe the desired result of a query without defining the physical execution steps or algorithms required to reach that result.
}
\section{Transaction Management and the ACID Test} Interaction with a DBMS occurs through specialized languages. The Data Definition Language (DDL) is used to define metadata—the "data about the data," such as the names of columns and their types. The Data Manipulation Language (DML), primarily SQL, is used to search for or update actual records.
A transaction is a single unit of work consisting of one or more database operations that must be treated as an indivisible whole. To maintain integrity, transactions must satisfy the ACID properties. SQL is distinct because it is a declarative language. In imperative languages like C++ or Python, a programmer must write the step-by-step instructions for how to perform a task. In a declarative language, the user only describes what result they want. The DBMS uses a query compiler to analyze the request and an execution engine to find the most efficient path—the "query plan"—to retrieve the data.
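Most systems let the user inspect the chosen plan. The sketch below assumes PostgreSQL-style syntax; the exact keyword and the format of the output differ between systems such as MySQL and SQLite.
\begin{verbatim}
-- Ask the engine how it would execute a declarative query;
-- the reported plan (scans, joins, index usage) is picked by the optimizer.
EXPLAIN
SELECT title
FROM   Movies
WHERE  year = 1994;
\end{verbatim}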
\dfn{Transaction}{ \nt{The efficiency of modern databases is largely due to the query compiler's ability to optimize a declarative request into a high-performance execution strategy.}
A program or set of actions that manages information and must be executed as an atomic unit to preserve database consistency.
}
\thm{The ACID Properties}{ \section{Measurement and Scaling in the Era of Big Data}
The fundamental requirements for reliable transaction processing:
\begin{itemize}
\item \textbf{Atomicity}: All-or-nothing execution; if any part of the transaction fails, the entire transaction is rolled back.
\item \textbf{Consistency}: Every transaction must leave the database in a state that satisfies all predefined rules and constraints.
\item \textbf{Isolation}: Each transaction must appear to execute in a vacuum, as if no other transactions are occurring simultaneously.
\item \textbf{Durability}: Once a transaction is completed, its effects are permanent and must survive any subsequent system failures.
\end{itemize}
}
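As a sketch of how these properties surface in practice (the \texttt{Accounts} table is hypothetical, and transaction syntax varies slightly between systems), a money transfer is wrapped in a single transaction so that either both updates persist or neither does:
\begin{verbatim}
BEGIN;                                       -- start an atomic unit of work
UPDATE Accounts SET balance = balance - 100 WHERE id = 1;
UPDATE Accounts SET balance = balance + 100 WHERE id = 2;
COMMIT;                                      -- make the effects durable
-- If any step fails, ROLLBACK undoes the partial work (atomicity).
\end{verbatim}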
\section{Database Roles and Ecosystems} The scale of data generated today is exponential, often said to double every few years. Engineers must be familiar with the international system of units for volume. While standard kilo and mega represent powers of 10 ($10^3$ and $10^6$), computer science often relies on binary prefixes like kibi ($2^{10} = 1024$) and mebi ($2^{20}$) to ensure precision in memory and storage calculations. We are now entering the age of Zettabytes and Yottabytes, requiring a deep understanding of how to scale information systems to meet these unprecedented demands.
A database environment involves several distinct roles. The Database Administrator (DBA) is responsible for coordination, monitoring, and access control. The Database Designer creates the structure and schema of the content. Power Users may interact with the system through complex programming, while Data Analysts use DML for updates and queries. Finally, Parametric Users interact with the database through simplified interfaces like menus and forms. \nt{The total amount of data created in just the last few years is estimated to be greater than the sum of all information produced in the entirety of previous human history.}
As systems grow, the challenge often becomes one of information integration. Large organizations may have many legacy databases that use different models or terms. Integration strategies include creating data warehouses—centralized repositories where data from various sources is translated and copied—or using mediators, which provide an integrated model of the data while translating requests for each individual source database.
\dfn{Data Warehouse}{
A centralized database used for reporting and data analysis, which stores integrated data from one or more disparate sources.
}
\thm{Legacy Database Problem}{
The difficulty of decommissioning old database systems because existing applications depend on them, necessitating the creation of integration layers to combine their data with newer systems.
}

@@ -1,183 +1,108 @@
\chapter{Queries with SQL} \chapter{Queries with SQL}
\section{Overview of SQL Querying} Structured Query Language (SQL) serves as the primary interface for interacting with relational databases. While the Data Definition Language (DDL) handles the creation and modification of database structures, the Data Manipulation Language (DML) is used for the retrieval and modification of records. The querying aspect of SQL is essentially a high-level, declarative implementation of relational algebra. Because SQL is declarative, users specify the desired properties of the result set rather than the procedural steps required to compute it. This allows the database management system (DBMS) to utilize a query optimizer to determine the most efficient execution strategy, known as a query plan.
The Structured Query Language (SQL) serves as the primary interface for interacting with relational database management systems. At its core, SQL is a declarative language, meaning that a programmer specifies what data should be retrieved rather than providing the step-by-step procedure for how to find it. This approach allows the database's internal query optimizer to determine the most efficient path to the data, leveraging indexes and specific storage structures. SQL is fundamentally set-based, manipulating entire relations as units rather than processing individual records in a procedural loop. This aligns closely with the mathematical foundations of relational algebra, where operations like selection, projection, and join act on sets or bags of tuples. Modern SQL implementations typically follow a set-based or bag-based processing model. Under bag semantics, relations are treated as multisets where duplicate records are permitted, contrasting with the strict set theory used in pure relational algebra. SQL queries are processed by a query compiler that translates the high-level syntax into a tree of algebraic operators, such as selection, projection, join, and grouping.
The language is typically divided into two main components: the Data Definition Language (DDL) and the Data Manipulation Language (DML). DDL is concerned with the metadata and the structural level of the database, involving the creation, modification, and deletion of tables and columns. DML, which is the focus of this summary, operates at the record level. It allows for the insertion of new data, the updating of existing records, the deletion of information, and most importantly, the querying of the stored data. \dfn{Declarative Language}{A programming paradigm in which the programmer defines what the result should look like (the logic of the computation) without describing its control flow (the procedural steps).}
\dfn{SQL}{Structured Query Language, a declarative and set-based language used to define and manipulate data within a relational database management system.} \thm{Query Plan}{A structured sequence of internal operations, often represented as a tree of relational algebra operators, that the DBMS execution engine follows to produce the results of a query.}
\thm{Declarative Programming}{The principle where the programmer describes the desired result of a computation without specifying the control flow or algorithmic steps to achieve it.} \section{Basic Query Structure: SELECT-FROM-WHERE}
\section{Fundamental Selection and Projection} The fundamental building block of a SQL query is the select-from-where expression. This structure corresponds to the three most common operations in relational algebra: projection, relation selection, and tuple selection.
The most basic form of a SQL query follows the select-from-where structure. This construct allows a user to extract specific attributes from one or more tables based on certain criteria. To understand this in the context of relational algebra, we can view the three main clauses as distinct algebraic operators. The FROM clause identifies the source relations; if multiple relations are listed, it effectively represents a Cartesian product. The WHERE clause functions as a selection operator ($\sigma$), filtering the tuples produced by the product based on a logical predicate. Finally, the SELECT clause acts as a projection operator ($\pi$), narrowing the result set to only the desired columns. The \texttt{FROM} clause identifies the relations (tables) from which data is to be retrieved. This is conceptually the first step of the query, as it defines the scope of the data. The \texttt{WHERE} clause specifies a predicate used to filter the tuples. Only records that satisfy this logical condition are passed to the next stage. Finally, the \texttt{SELECT} clause identifies which attributes (columns) should be returned in the output. This is equivalent to the projection operator ($\pi$) in relational algebra. If a user wishes to retrieve all columns, a wildcard asterisk (*) is used.
In its simplest manifestation, a query can use the asterisk symbol (*) to denote that all columns from the source table should be included in the output. While this is useful for exploration, explicit projection is preferred in production environments to minimize data transfer and clarify the schema of the result set. Within the SELECT clause, we are not limited to just listing attributes; we can also include arithmetic expressions, such as calculating a value based on existing columns, or constants to provide context in the output. \thm{SELECT-FROM-WHERE Mapping}{A basic SQL query of the form \texttt{SELECT L FROM R WHERE C} is equivalent to the relational algebra expression $\pi_{L}(\sigma_{C}(R))$.}
\dfn{Query}{A formal request for information from a database, typically expressed in SQL to retrieve specific data matching a set of conditions.} \nt{In SQL, the select-list can include not only existing attributes but also constants and computed expressions, functioning like an extended projection.}
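As a small worked example (assuming a hypothetical relation \texttt{Movies(title, year, length)}), the query below corresponds to $\pi_{title,\, lengthInHours}(\sigma_{year = 1994}(Movies))$, with an extended projection for the computed column:
\begin{verbatim}
SELECT title,
       length / 60.0 AS lengthInHours   -- computed expression in the select-list
FROM   Movies
WHERE  year = 1994;                     -- predicate tested against each tuple
\end{verbatim}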
\thm{SQL to Relational Algebra Mapping}{A simple select-from-where query is equivalent to the relational algebra expression $\pi_L(\sigma_C(R_1 \times R_2 \times ... \times R_n))$, where L is the select list, C is the where condition, and $R_i$ are the relations in the from list.} \section{Logic, Comparisons, and Three-Valued Logic}
\section{Renaming and Aliasing} Filters in the \texttt{WHERE} clause are constructed using comparison operators such as equality (=), inequality (<> or !=), and range comparisons (<, >, <=, >=). SQL also supports pattern matching for strings through the \texttt{LIKE} operator, where the percent sign (%) matches any sequence of characters and the underscore (_) matches any single character.
When a query is executed, the resulting relation has column headers that default to the names of the attributes in the source tables. However, there are many cases where these names may be ambiguous or uninformative, particularly when calculations are involved. SQL provides the \texttt{AS} keyword to assign an alias to a column or an expression. This allows the programmer to rename the output for better readability or to comply with the requirements of an application. A critical aspect of SQL logic is the treatment of \texttt{NULL} values. Because a \texttt{NULL} represents an unknown or missing value, comparing anything to \texttt{NULL} results in a truth value of \texttt{UNKNOWN}. This necessitates a three-valued logic system.
Aliases can also be applied to relations in the FROM clause. These are referred to as tuple variables or correlation names. Aliasing a relation is essential when a query must compare different rows within the same table, a process known as a self-join. By assigning different aliases to the same table, the query can treat them as two distinct sources of data, enabling comparisons between tuples. \dfn{Three-Valued Logic}{A logical framework where expressions can evaluate to TRUE, FALSE, or UNKNOWN, specifically required to handle comparisons involving NULL values.}
\dfn{Alias}{A temporary name assigned to a table or a column within the scope of a single SQL query to improve clarity or disambiguate references.} The behavior of logical operators under three-valued logic follows specific rules:
\section{String Patterns and Comparison Operators} \begin{itemize}
\item \textbf{AND}: The result is TRUE only if both operands are TRUE. If one is FALSE, the result is FALSE regardless of the other. If one is TRUE and the other is UNKNOWN, the result is UNKNOWN.
\item \textbf{OR}: The result is TRUE if at least one operand is TRUE. If one is TRUE, the result is TRUE regardless of the other. If one is FALSE and the other is UNKNOWN, the result is UNKNOWN.
\item \textbf{NOT}: The negation of UNKNOWN remains UNKNOWN.
\end{itemize}
SQL provides a robust set of comparison operators to filter data within the WHERE clause. These include equality (=), inequality (<>), and various ordering comparisons (<, >, <=, >=). While numeric comparisons are straightforward, string comparisons follow lexicographical order. \nt{The \texttt{WHERE} clause only retains tuples for which the predicate evaluates to TRUE. Records that evaluate to FALSE or UNKNOWN are filtered out.}
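For illustration (hypothetical \texttt{Movies} table), pattern matching uses \texttt{LIKE}, and tests for missing values must use \texttt{IS NULL} or \texttt{IS NOT NULL}, since an ordinary comparison with \texttt{NULL} evaluates to UNKNOWN and the row is filtered out:
\begin{verbatim}
SELECT title
FROM   Movies
WHERE  title LIKE 'Star%'      -- any title beginning with 'Star'
  AND  length IS NOT NULL;     -- 'length = NULL' would never evaluate to TRUE
\end{verbatim}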
For more flexible string matching, the `LIKE` operator is used with patterns. This operator allows for partial matches using two special wildcard characters: the percent sign (%) and the underscore (_). The percent sign matches any sequence of zero or more characters, while the underscore matches exactly one character. This is particularly useful for finding records where only a portion of a string is known or for identifying specific substrings. \section{Ordering and Limiting Results}
\dfn{Predicate}{A logical expression that evaluates to true, false, or unknown, used in the WHERE clause to determine which tuples satisfy the query criteria.} While relational algebra results are conceptually unordered sets or bags, SQL allows users to impose a specific order on the output using the \texttt{ORDER BY} clause. Sorting can be performed in ascending (\texttt{ASC}, the default) or descending (\texttt{DESC}) order. Multiple columns can be specified to handle ties.
\thm{Lexicographical Comparison}{The method of ordering strings based on the alphabetical order of their component characters, where a string is "less than" another if it appears earlier in a dictionary.} Furthermore, SQL provides mechanisms to limit the size of the result set, which is particularly useful for performance and pagination. The \texttt{LIMIT} clause restricts the total number of rows returned, while the \texttt{OFFSET} clause skips a specified number of rows before beginning to return results.
\section{Handling Incomplete Information with Null Values} \thm{List Semantics}{When an \texttt{ORDER BY} clause is applied, the result set is treated as a list rather than a bag, meaning the sequence of records is guaranteed and meaningful for the application.}
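A brief sketch (hypothetical \texttt{Movies} table; \texttt{LIMIT}/\texttt{OFFSET} syntax varies slightly across systems):
\begin{verbatim}
SELECT   title, year
FROM     Movies
ORDER BY year DESC, title ASC   -- ties on year are broken by title
LIMIT    10 OFFSET 20;          -- skip the first 20 rows, return the next 10
\end{verbatim}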
In real-world databases, it is common for certain pieces of information to be missing or inapplicable. SQL represents this missing data with a special marker called `NULL`. It is important to recognize that `NULL` is not a value in the same way 0 or an empty string is; it is a placeholder indicating the absence of a value. \section{Multi-Relation Queries and Joins}
Because `NULL` represents unknown data, comparisons involving `NULL` cannot result in a standard true or false. Instead, SQL employs a three-valued logic system that includes `UNKNOWN`. For example, if we compare a column containing a `NULL` to a constant, the result is `UNKNOWN`. To explicitly check for these placeholders, SQL provides the `IS NULL` and `IS NOT NULL` operators. Standard equality comparisons like `= NULL` will always evaluate to `UNKNOWN` and therefore fail to filter the desired records. SQL allows queries to involve multiple relations by listing them in the \texttt{FROM} clause. When multiple tables are listed without a joining condition, the result is a Cartesian product, where every tuple from the first relation is paired with every tuple from the second.
\dfn{NULL}{A special marker in SQL used to indicate that a data value does not exist in the database, either because it is unknown or not applicable.} To perform meaningful combinations, join conditions must be specified. These conditions link related data across tables, typically by equating a primary key in one table with a foreign key in another. If attribute names are identical across tables, they must be disambiguated using the table name or a tuple variable (alias).
\thm{Three-Valued Logic}{A system of logic where expressions can evaluate to TRUE, FALSE, or UNKNOWN, requiring specialized truth tables for AND, OR, and NOT operations.} \dfn{Tuple Variable (Alias)}{A temporary name assigned to a table in the \texttt{FROM} clause, used to shorten queries, disambiguate column references, or allow a table to be joined with itself (self-join).}
\section{Logic and Truth Tables in SQL} SQL provides explicit join syntax as an alternative to the comma-separated list in the \texttt{FROM} clause:
The presence of `UNKNOWN` values necessitates a clear understanding of how logical operators behave. When combining conditions with `AND`, the result is the minimum of the truth values, where TRUE is 1, UNKNOWN is 0.5, and FALSE is 0. Conversely, `OR` takes the maximum of the truth values. The `NOT` operator subtracts the truth value from 1. \begin{itemize}
\item \textbf{CROSS JOIN}: Produces the Cartesian product.
\item \textbf{INNER JOIN}: Returns only the tuples that satisfy the join condition.
\item \textbf{NATURAL JOIN}: Automatically joins tables based on all columns with matching names and removes the redundant duplicate column.
\item \textbf{OUTER JOIN}: Preserves "dangling tuples" that do not have a match in the other relation, padding the missing values with \texttt{NULL}. These come in \texttt{LEFT}, \texttt{RIGHT}, and \texttt{FULL} varieties.
\end{itemize}
In the context of a WHERE clause, a tuple is only included in the final result set if the entire condition evaluates to `TRUE`. Tuples for which the condition is `FALSE` or `UNKNOWN` are excluded. This behavior can lead to unintuitive results, such as a query for "all records where X is 10 OR X is not 10" failing to return records where X is `NULL`, because the result of that OR operation would be `UNKNOWN`. \nt{The \texttt{USING} clause is a safer alternative to \texttt{NATURAL JOIN} as it allows the user to explicitly specify which columns with shared names should be used for the join, preventing accidental matches on unrelated columns.}
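As a sketch of the explicit join syntax (assuming hypothetical relations \texttt{Movies(title, producerID)} and \texttt{Producers(producerID, name)}):
\begin{verbatim}
-- Inner join via USING: only movies whose producerID has a match
SELECT m.title, p.name
FROM   Movies m
JOIN   Producers p USING (producerID);

-- Left outer join: dangling Movies tuples are kept, name is padded with NULL
SELECT m.title, p.name
FROM   Movies m
LEFT OUTER JOIN Producers p ON m.producerID = p.producerID;
\end{verbatim}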
\dfn{Truth Table}{A mathematical table used to determine the result of logical operations given all possible combinations of input truth values.} \section{Subqueries and Nesting}
\section{Multi-Relation Queries and the Cartesian Product} SQL is highly recursive, allowing queries to be nested within other queries. A subquery can appear in the \texttt{WHERE}, \texttt{FROM}, or \texttt{SELECT} clauses.
When a query involves data spread across multiple tables, the FROM clause lists all the relevant relations. The logical starting point for such a query is the Cartesian product, which pairs every tuple from the first relation with every tuple from the second, and so on. This produces a very large intermediate relation where each row represents a potential combination of the source data. Subqueries that return a single row and a single column are called scalar subqueries and can be used anywhere a constant is expected. Subqueries that return a single column (a list of values) can be used with operators like \texttt{IN}, \texttt{ANY}, or \texttt{ALL}. The \texttt{EXISTS} operator is used to check if a subquery returns any results at all.
To make this product useful, the WHERE clause must contain join conditions that link the relations based on common attributes. For instance, if we are joining a 'Movies' table with a 'Producers' table, we might equate the 'producerID' column in both. This filtering process discards the vast majority of the Cartesian product, leaving only the rows where the related data actually matches. When attributes in different tables share the same name, we use the dot notation (e.g., TableName.AttributeName) to disambiguate the references. \thm{Correlated Subquery}{A subquery that references an attribute from the outer query. It conceptually requires the subquery to be evaluated once for every row processed by the outer query.}
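For illustration with the same hypothetical relations, a non-correlated \texttt{IN} subquery and a correlated \texttt{EXISTS} subquery:
\begin{verbatim}
-- Non-correlated: the inner query is evaluated once
SELECT title
FROM   Movies
WHERE  producerID IN (SELECT producerID FROM Producers WHERE name = 'Kubrick');

-- Correlated: the inner query refers to the current outer tuple p
SELECT p.name
FROM   Producers p
WHERE  EXISTS (SELECT * FROM Movies m WHERE m.producerID = p.producerID);
\end{verbatim}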
\thm{The Join-Selection Equivalence}{The principle that a natural join or an equijoin can be logically expressed as a selection performed on a Cartesian product of relations.} \section{Duplicate Elimination and Set Operations}
\section{Interpretation of Multi-Relation Queries} Because SQL defaults to bag semantics, it often produces duplicate rows. The \texttt{DISTINCT} keyword can be added to the \texttt{SELECT} clause to force set semantics by removing these duplicates.
There are multiple ways to interpret the execution of a query involving several relations. One helpful mental model is the "nested loops" approach. In this model, we imagine a loop for each relation in the FROM clause. The outermost loop iterates through every tuple of the first relation, and for each of those, the next loop iterates through the second relation, and so on. Inside the innermost loop, the WHERE condition is tested against the current combination of tuples. If the condition is met, the SELECT clause produces an output row. SQL also supports standard set operations: \texttt{UNION}, \texttt{INTERSECT}, and \texttt{EXCEPT} (or \texttt{MINUS}). By default, these operations eliminate duplicates. If bag semantics are desired, the \texttt{ALL} keyword must be appended (e.g., \texttt{UNION ALL}).
Another interpretation is based on parallel assignment. In this view, we consider all possible assignments of tuples to the variables representing the relations. We then filter for those assignments that satisfy the condition. While the nested loop model is more algorithmic, the parallel assignment model highlights the declarative nature of the query, emphasizing that the order of the relations in the FROM clause should not, in theory, affect the result. \nt{Duplicate elimination is a computationally expensive operation because it requires sorting or hashing the entire result set to identify matching tuples.}
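A short sketch (hypothetical \texttt{Movies} table) of duplicate elimination and a bag-preserving set operation:
\begin{verbatim}
SELECT DISTINCT studioName FROM Movies;       -- force set semantics

SELECT title FROM Movies WHERE year = 1994
UNION ALL                                     -- keep duplicates (bag semantics)
SELECT title FROM Movies WHERE length > 120;
\end{verbatim}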
\dfn{Tuple Variable}{A variable that ranges over the tuples of a relation, often implicitly created for each table in the FROM clause or explicitly defined as an alias.} \section{Aggregation and Grouping}
\section{Set Operators and Bag Semantics} Aggregation allows users to summarize large volumes of data into single representative values. SQL provides five standard aggregate functions: \texttt{COUNT}, \texttt{SUM}, \texttt{AVG}, \texttt{MIN}, and \texttt{MAX}.
SQL provides operators for the traditional set-theoretic actions: `UNION`, `INTERSECT`, and `EXCEPT`. These allow the results of two queries to be combined, provided they have the same schema (compatible attribute types and order). By default, these operators follow set semantics, meaning that they automatically eliminate duplicate tuples from the result. The \texttt{GROUP BY} clause partitions the data into groups based on the values of one or more columns. Aggregate functions are then applied to each group independently. A critical restriction exists when using grouping: any column appearing in the \texttt{SELECT} list that is not part of an aggregate function must be included in the \texttt{GROUP BY} clause.
However, since SQL is fundamentally based on bags (multisets), it also provides versions of these operators that preserve duplicates using the `ALL` keyword. `UNION ALL` simply concatenates the two result sets. `INTERSECT ALL` produces a tuple as many times as it appears in both inputs (taking the minimum count). `EXCEPT ALL` produces a tuple as many times as it appears in the first input minus the number of times it appears in the second (taking the difference). Using bag semantics is often more efficient because the system does not need to perform the expensive work of sorting or hashing the data to find and remove duplicates. \thm{The Aggregation Rule}{In a query using grouping, the output can only consist of the attributes used for grouping and the results of aggregate functions applied to the groups.}
\dfn{Bag}{A collection of elements that allows for multiple occurrences of the same element, where the order of elements remains immaterial.} For filtering data after it has been aggregated, SQL uses the \texttt{HAVING} clause. Unlike \texttt{WHERE}, which filters individual rows before they are grouped, \texttt{HAVING} filters the groups themselves based on aggregate properties.
\thm{Closure of Bag Operations}{The property that the result of any operation on bags is also a bag, ensuring that the relational model remains consistent through complex sequences of operations.} \dfn{Aggregate Function}{A function that takes a collection of values as input and returns a single value as a summary, such as a total or an average.}
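As a worked sketch (hypothetical \texttt{Movies(studioName, year, length)}): \texttt{WHERE} filters individual rows before grouping, while \texttt{HAVING} filters the resulting groups on an aggregate:
\begin{verbatim}
SELECT   studioName,
         COUNT(*)    AS numMovies,
         AVG(length) AS avgLength
FROM     Movies
WHERE    year >= 2000              -- row-level filter, applied before grouping
GROUP BY studioName
HAVING   SUM(length) > 500;        -- group-level filter on an aggregate
\end{verbatim}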
\section{Nested Queries and Scalar Subqueries} \section{Advanced Table Expressions and Common Table Expressions}
A subquery is a query nested within another query. Subqueries can appear in various parts of a SQL statement, including the WHERE, FROM, and HAVING clauses. A scalar subquery is one that returns exactly one row and one column—a single value. Because it evaluates to a scalar, it can be used anywhere a constant or an attribute would be valid, such as in a comparison. To improve query readability and maintainability, SQL provides mechanisms to define temporary relations within a single query. The \texttt{VALUES} clause can be used to construct a constant table on the fly. More importantly, the \texttt{WITH} clause allows for the definition of Common Table Expressions (CTEs).
If a scalar subquery is designed to find, for instance, the specific ID of a person, and the data actually contains two people with that name, the query will fail at runtime. The system expects a single value and cannot resolve the ambiguity. If the subquery returns no rows, it is treated as a `NULL`. \thm{Common Table Expression (CTE)}{A temporary named result set that exists only within the scope of a single query, providing a way to decompose complex queries into smaller, logical steps.}
\dfn{Scalar}{A single atomic value, such as an integer or a string, as opposed to a collection of values like a row or a table.} CTEs can also be recursive, allowing SQL to perform operations that are impossible in standard relational algebra, such as computing the transitive closure of a graph (e.g., finding all reachable cities in a flight network).
\section{Conditions on Relations: IN and EXISTS} \nt{The \texttt{WITH RECURSIVE} statement typically consists of a base case (non-recursive query) and a recursive step joined by a \texttt{UNION} operator.}
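A hedged sketch of a recursive CTE that computes reachability in a flight network (the \texttt{Flights(origin, destination)} relation is hypothetical):
\begin{verbatim}
WITH RECURSIVE Reachable(city) AS (
    SELECT destination FROM Flights WHERE origin = 'Zurich'   -- base case
    UNION                                              -- dedup also stops cycles
    SELECT f.destination                               -- recursive step
    FROM   Flights f JOIN Reachable r ON f.origin = r.city
)
SELECT * FROM Reachable;
\end{verbatim}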
When a subquery returns a set of values rather than a single scalar, it can be used with relational operators like `IN`. The expression `x IN (subquery)` evaluates to true if the value of x is found in the result set produced by the subquery. This is a powerful way to filter data based on membership in a dynamically calculated set. \section{Conclusion on Query Logic}
The `EXISTS` operator is another tool for dealing with subqueries. It takes a subquery as an argument and returns true if the subquery returns at least one row. Unlike `IN`, `EXISTS` does not look at the actual values returned; it only checks for the existence of results. This is often used in correlated subqueries to check for the presence of related records in another table. Querying with SQL represents a bridge between high-level human requirements and mathematical relational theory. By understanding the underlying relational algebra—selection, projection, products, and joins—users can write more efficient and accurate queries. The complexity of SQL arises from its need to handle real-world data nuances, such as missing information (\texttt{NULL}s) and the desire for summarized reports (aggregation). Mastering the order of operations—starting from the \texttt{FROM} clause, moving through \texttt{WHERE} and \texttt{GROUP BY}, and finally reaching \texttt{SELECT}, \texttt{HAVING}, and \texttt{ORDER BY}—is essential for any database engineer.
\thm{Existence Quantification}{The logical principle of checking whether there is at least one element in a set that satisfies a given property, implemented in SQL via the EXISTS operator.} The relationship between SQL and its execution can be viewed as a translation process: the user speaks in "declarative" desires, while the database engine converts those desires into a "procedural" query plan, much like a chef translating a customer's order into a sequence of kitchen tasks.
\section{Correlated Subqueries and Scoping}
A correlated subquery is a nested query that refers to attributes of the outer query. Because of this dependency, the subquery must, in concept, be re-evaluated for every row processed by the outer query. This creates a link between the two levels of the query, allowing for complex logic like "find all employees whose salary is higher than the average salary in their specific department."
Scoping rules in SQL dictate how attribute names are resolved. An attribute in a subquery will first be looked for in the tables mentioned in that subquery's own FROM clause. If it is not found there, the system looks at the FROM clause of the next level out, and so on. If the same attribute name appears in multiple levels, we must use aliases to ensure the correct column is referenced. Correlated subqueries are often more expressive than simple joins but can be more computationally expensive if the optimizer cannot unnest them into a join.
\dfn{Correlated Subquery}{A subquery that depends on the current row being processed by the outer query, identified by references to attributes defined in the outer scope.}
\section{Join Expressions and Syntax Variants}
While the select-from-where structure can express most joins, SQL also provides explicit join syntax. A `CROSS JOIN` is a direct representation of the Cartesian product. A `JOIN ... ON` allows the join condition to be specified explicitly in the FROM clause, which many developers find clearer than placing the condition in the WHERE clause.
A `NATURAL JOIN` is a specialized form of join that automatically equates all columns with the same name in both tables and removes the redundant copies of those columns. While natural joins are concise, they can be risky because they depend on attribute names. If a schema change adds a column to one table that happens to share a name with a column in another, the natural join logic will change automatically and potentially break the query. The `USING` clause provides a middle ground, allowing the user to specify exactly which common columns should be used for the join.
\dfn{Natural Join}{A join operation that matches tuples based on all attributes that have the same name in both relations, producing a result that contains only one copy of each common attribute.}
\section{Outer Joins and Data Preservation}
In a standard (inner) join, tuples that do not have a match in the other table are discarded. These are called "dangling tuples." If we wish to preserve these tuples in our result set, we use an `OUTER JOIN`. There are three types: `LEFT`, `RIGHT`, and `FULL`. A `LEFT OUTER JOIN` includes all tuples from the left table; if a tuple has no match in the right table, the columns from the right table are filled with `NULL`.
The `RIGHT OUTER JOIN` is symmetric, preserving all rows from the right table. A `FULL OUTER JOIN` preserves all rows from both tables, ensuring that no information from either source is lost. Outer joins are essential when we need a comprehensive list of items, even if some of those items lack certain related data.
\dfn{Dangling Tuple}{A tuple in one relation that does not match any tuple in another relation based on the join criteria.}
\thm{The Outer Join Property}{The guarantee that all tuples of the specified operand relations will be represented in the result, with NULL values used to fill in missing components for non-matching rows.}
\section{Aggregation and Data Summarization}
SQL includes several built-in functions to perform calculations across entire columns of data. These are known as aggregation operators. The five standard operators are `SUM`, `AVG`, `MIN`, `MAX`, and `COUNT`. `SUM` and `AVG` can only be applied to numeric data, while `MIN` and `MAX` can also be applied to strings (using lexicographical order) or dates.
The `COUNT` operator is versatile; `COUNT(*)` counts every row in a table, while `COUNT(attribute)` counts only the non-null values in that specific column. If we wish to count only the unique values, we can use the `DISTINCT` keyword inside the aggregation, such as `COUNT(DISTINCT studioName)`. It is vital to remember that all aggregations except for `COUNT` return `NULL` if they are applied to an empty set of values. `COUNT` returns 0 for an empty set.
\dfn{Aggregation}{The process of summarizing multiple values into a single value through functions like summation or averaging.}
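For illustration (hypothetical \texttt{Movies} table), the three counting variants behave differently in the presence of NULLs and duplicates:
\begin{verbatim}
SELECT COUNT(*)                   AS allRows,        -- counts every row
       COUNT(length)              AS nonNullLengths, -- ignores NULL values
       COUNT(DISTINCT studioName) AS studios         -- counts unique values only
FROM   Movies;
\end{verbatim}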
\section{Grouping and Partitioning}
The `GROUP BY` clause allows us to partition the rows of a relation into groups based on their values in one or more attributes. When a query contains a `GROUP BY` clause, the SELECT clause is limited in what it can contain. Every attribute listed in the SELECT clause must either be an attribute used for grouping or be part of an aggregate function.
Conceptually, the system first creates the groups and then applies the aggregate functions to each group independently. The result is a single row for each unique combination of values in the grouping attributes. This is the primary way to generate reports and statistics, such as "the total number of movies produced by each studio per year."
\dfn{Grouping Attribute}{An attribute used in the GROUP BY clause to define the partitions upon which aggregation functions will operate.}
\section{Post-Aggregation Filtering with HAVING}
Sometimes we want to filter the results of a query based on an aggregate value. However, the WHERE clause is evaluated before any grouping or aggregation takes place. Therefore, we cannot use a condition like `WHERE SUM(length) > 500`. To solve this, SQL provides the `HAVING` clause.
The `HAVING` clause is evaluated after the groups have been formed and the aggregations have been calculated. It allows the programmer to specify conditions that apply to the group as a whole. Only the groups that satisfy the `HAVING` condition will appear in the final output. While `HAVING` can technically contain any condition, it is best practice to only use it for conditions involving aggregates, leaving all tuple-level filtering to the WHERE clause.
\dfn{HAVING}{A clause in SQL used to specify conditions that filter groups of rows created by the GROUP BY clause, typically involving aggregate functions.}
\thm{Query Execution Order}{The logical sequence of operations in a SQL query: FROM (and JOINs), then WHERE, then GROUP BY, then HAVING, and finally SELECT (and DISTINCT) and ORDER BY.}
\section{Ordering and Sorting the Result}
The final step in many queries is to present the data in a specific order for the user. The `ORDER BY` clause facilitates this, allowing for sorting by one or more columns in either ascending (`ASC`) or descending (`DESC`) order. Sorting is the last operation performed before the data is returned; even if a column is not projected in the SELECT clause, it can still be used for sorting if it was available in the source tables.
If multiple columns are listed in the `ORDER BY` clause, the system sorts by the first column first. If there are ties, it uses the second column to break them, and so on. This ensures a deterministic and readable presentation of the retrieved information.
\dfn{Sorting}{The process of arranging the rows of a result set in a specific sequence based on the values of one or more attributes.}
\section{Extended Projection and Constants}
The extended projection operator allows for more than just choosing columns. It enables the use of expressions that combine attributes or apply functions to them. In SQL, this is manifested in the SELECT list, where we can perform additions, concatenations, or even call stored functions.
Constants are also frequently used in the SELECT list. For example, a query might select "Movie", title, year from a table. Every resulting row would have the string literal "Movie" as its first column. This is often used to label different parts of a union or to provide fixed formatting for an external application.
\thm{Functional Dependency in Aggregation}{The rule that in a grouped query, any attribute in the SELECT list that is not aggregated must be functionally determined by the grouping attributes to ensure the result is well-defined.}
\section{Nested Queries in the FROM Clause}
SQL allows a subquery to be placed in the FROM clause. In this case, the subquery acts as a temporary table that exists only for the duration of the outer query. This is particularly useful when we need to perform multiple levels of aggregation or when we want to join a table with a summarized version of itself.
When a subquery is used in the FROM clause, it must be assigned an alias. This alias allows the outer query to refer to the columns produced by the subquery. This technique is often a cleaner alternative to using complex correlated subqueries in the WHERE clause, as it makes the flow of data more explicit.
\dfn{Derived Table}{A temporary result set returned by a subquery in the FROM clause, which is then used by the outer query as if it were a physical table.}
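A sketch of a derived table (hypothetical \texttt{Movies} table): the inner query is aggregated first, and the outer query then filters its result as if it were an ordinary table:
\begin{verbatim}
SELECT s.studioName, s.totalLength
FROM  (SELECT   studioName, SUM(length) AS totalLength
       FROM     Movies
       GROUP BY studioName) AS s            -- the alias s is mandatory
WHERE  s.totalLength > 1000;
\end{verbatim}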
\section{Summary of Advanced SQL Syntax}
Throughout our exploration of Chapter 6 and the accompanying presentation, we have seen that SQL is far more than a simple tool for data retrieval. Its ability to nest logic, perform complex aggregations across partitioned data, and handle various join types allows it to solve sophisticated data analysis problems. The transition from the mathematical abstractions of relational algebra to the practical syntax of SQL reveals how each keyword serves a specific logical function in the data-processing pipeline.
By understanding the declarative nature of the language and the underlying bag semantics, developers can write queries that are not only correct but also efficient. The careful management of NULLs, the strategic use of subqueries, and the mastery of grouping and having clauses form the foundation of expert database programming. This comprehensive summary has detailed the syntax and the theoretical justifications for the most critical features of SQL querying, providing a roadmap for complex data manipulation.
\thm{The Universal Query Form}{The select-from-where block is the universal building block of SQL, capable of expressing any operation that can be represented by the core operators of relational algebra.}

@@ -1,67 +1,83 @@
\chapter{Relational Algebra} \chapter{Relational Algebra}
Relational algebra serves as the formal foundation for the manipulation of data within the relational model. It is a procedural language consisting of a set of operators that take one or more relations as input and produce a new relation as output. This mathematical framework is essential for database systems because it provides a precise way to represent queries and allows the system's query optimizer to reorganize expressions into more efficient execution plans. Unlike general-purpose programming languages such as C or Java, relational algebra is intentionally limited in power. For example, it cannot perform arbitrary calculations like factorials or determine if the number of tuples in a relation is even or odd. However, this limitation is a strategic advantage, as it enables the database engine to perform high-level optimizations that would be impossible in a more complex language. Relational algebra serves as the formal mathematical foundation for manipulating data within the relational model. While Data Definition Language (DDL) is concerned with the static structure of the database, relational algebra provides the dynamic framework for the Data Manipulation Language (DML). It is a notation consisting of a set of operators that take one or more relations as input and produce a new relation as output. This operational approach allows for the expression of complex queries by nesting and combining simpler operations.
The algebra is characterized by the property of closure, where every result is a relation that can immediately serve as an input for another operation. Operators are generally categorized into unary operators, which act on a single relation, and binary operators, which combine two relations. The core operations include set-based maneuvers—such as union, intersection, and difference—and relational-specific maneuvers like selection, projection, joins, and renaming. In modern database implementations, these concepts are often extended to support bag semantics (allowing duplicates) and complex operations like grouping and sorting. The power of relational algebra lies in its ability to abstract away from the physical storage of data, focusing instead on the logical transformation of information. It is essentially to relational tables what basic arithmetic is to numbers or matrix algebra is to vectors. By defining precise rules for how tables are filtered, combined, and restructured, it ensures that query results are predictable and mathematically sound.
\dfn{Relational Algebra}{A collection of mathematical operators that function on relations to perform data retrieval and manipulation. It is closed under its operations, ensuring that the output of any expression is itself a relation.} \section{The Concept of Relational Variables and Closure}
\thm{The Power of Optimization}{By limiting the expressive power of the query language to relational algebra, database systems can effectively optimize code. This allows the system to replace inefficient execution strategies with mathematically equivalent but significantly faster algorithms.} A fundamental aspect of relational algebra is the way it identifies and treats data structures. In most mathematical contexts, an object does not inherently know the variable name assigned to it. However, in database theory, we often use the term "relvar" (relational variable) to describe a relation that is explicitly associated with a name. This allows the system to refer to stored data and intermediate results throughout the execution of a query.
\section{The Unary Operators: Selection and Projection} Another critical property of relational algebra is closure. This principle dictates that because the input and output of every algebraic operator is a relation, operators can be nested indefinitely. This is identical to how the addition of two integers always results in another integer, allowing for the construction of complex arithmetic expressions.
Selection and projection are the primary filters used to reduce the size of a relation. Selection, denoted by the Greek letter sigma ($\sigma$), acts as a horizontal filter. It examines each tuple in a relation and retains only those that satisfy a specific logical condition. This condition can involve attribute comparisons to constants or other attributes using standard operators such as equality, inequality, and logical connectors like AND and OR. \thm{The Property of Closure}{In relational algebra, the result of any operation is always another relation. This ensures that the output of one operator can serve as the valid input for any subsequent operator in a query tree.}
Projection, denoted by the Greek letter pi ($\pi$), acts as a vertical filter. It is used to choose specific columns from a relation while discarding others. In its classical set-based form, projection also serves to eliminate any duplicate tuples that may arise when certain attributes are removed. This ensures that the result remains a valid set. Extended versions of projection allow for the creation of new attributes through calculations or renamings of existing fields. \dfn{Relational Variable (Relvar)}{A relvar is a named variable that is assigned a specific relation as its value, effectively allowing the database to track and manipulate data through an explicit identifier.}
\dfn{Selection}{An operation $\sigma_C(R)$ that produces a relation containing all tuples from $R$ that satisfy the condition $C$. It does not change the schema of the relation but reduces the number of rows.} \section{Unary Operators: Selection, Projection, and Renaming}
\dfn{Projection}{An operation $\pi_L(R)$ that creates a new relation consisting of only the attributes listed in $L$. It transforms the schema of the relation and may reduce the number of rows if duplicates are removed.} Unary operators are those that act upon a single relation. The three primary unary operators are selection, projection, and renaming. These tools allow a user to isolate specific rows, columns, or change the labels of the data structure.
\section{Set Operations and Compatibility Constraints} Selection, denoted by the Greek letter sigma ($\sigma$), acts as a horizontal filter. It extracts only those records (tuples) that satisfy a specific condition, known as a predicate. This predicate can involve logical comparisons, arithmetic, and boolean operators. Importantly, selection does not change the schema of the table; the output has the exact same attributes and domains as the input.
Relational algebra incorporates standard set operations: union ($\cup$), intersection ($\cap$), and difference ($-$). Because these operations are inherited from set theory, they require the participating relations to be "compatible." This means the relations must share the same schema—specifically, they must have the same set of attributes, and the domains (data types) associated with corresponding attributes must be identical. Projection, denoted by the Greek letter pi ($\pi$), serves as a vertical filter. It allows a user to choose a specific subset of attributes from a relation, discarding the rest. Since the output is still a relation (under set semantics), any duplicate rows that might appear because of the removal of identifying columns must be eliminated.
Union combines all tuples from two relations into a single result. Intersection identifies tuples that appear in both input relations. Difference, which is not commutative, returns tuples found in the first relation but not the second. While these were originally defined for sets, modern systems often apply them to "bags" (multisets), where the rules for handling duplicates differ. For instance, in bag union, the number of occurrences of a tuple is the sum of its occurrences in the inputs, whereas in set union, it appears only once. Renaming, denoted by the Greek letter rho ($\rho$), does not change the data within a relation but alters the metadata. It can be used to change the name of the relation itself or the names of specific attributes. This is often necessary when joining a table with itself or preparing for set operations where attribute names must match.
\dfn{Schema Compatibility}{The requirement that two relations involved in a set operation must have the same number of attributes, with matching names and identical data types for each corresponding column.} \dfn{Selection ($\sigma$)}{The selection operator identifies and retrieves a subset of tuples from a relation that meet a defined logical condition.}
\thm{Commutativity and Associativity}{Set union and intersection are both commutative ($R \cup S = S \cup R$) and associative $((R \cup S) \cup T = R \cup (S \cup T))$, allowing the system to reorder these operations for better performance.} \dfn{Projection ($\pi$)}{The projection operator creates a new relation consisting only of a specified subset of attributes from the original relation.}
\section{Renaming and Relational Variables} \nt{In modern query processors, an extended version of projection is often used. This allows not only the selection of attributes but also the creation of new columns through calculations or string manipulations based on existing data.}
In complex queries, it is often necessary to change the name of an attribute or the relation itself to avoid ambiguity or to prepare a relation for a set operation. The renaming operator, denoted by the Greek letter rho ($\rho$), allows for this modification. This is particularly useful when joining a relation with itself, as it provides a way to distinguish between the two copies. \section{Binary Set Operations}
The concept of a "relvar" (relational variable) is also important here. A relvar is essentially a variable that has a name and is assigned a specific relation. In algebraic expressions, we use these names to refer to the data stored within the tables. Relational algebra incorporates traditional set theory operations, including union ($\cup$), intersection ($\cap$), and subtraction ($-$). However, these cannot be applied to any two arbitrary tables. They require the operands to be "union-compatible." This means the two relations must share the exact same set of attributes, and each corresponding attribute must share the same domain.
\dfn{Renaming}{An operator $\rho_S(R)$ that returns a new relation identical to $R$ in content but renamed to $S$. It can also be used as $\rho_{S(A_1, ..., A_n)}(R)$ to rename individual attributes.} \thm{Rules for Set Operations}{For two relations $R$ and $S$ to participate in a union, intersection, or difference, they must:
\begin{enumerate}
\item Possess the same set of attributes.
\item Have identical domains for each corresponding attribute.
\item (In mathematical relations) Maintain the same order of attributes to satisfy Cartesian product rules.
\end{enumerate}}
\section{Combining Relations: Products and Joins} The union of $R$ and $S$ includes all tuples that appear in either $R$, $S$, or both. The intersection includes only those tuples found in both relations. Subtraction (or set difference) retrieves tuples that are present in the first relation but not the second. It is important to note that while union and intersection are commutative, subtraction is not; the order of operands changes the result.
The most complex operations in relational algebra involve combining information from different relations. The Cartesian Product ($\times$) is the most basic of these, pairing every tuple of the first relation with every tuple of the second. While mathematically simple, the product often produces very large relations that contain many irrelevant pairings. \section{Joining Relations: Products and Joins}
Joins are more refined versions of the product. The Theta-Join ($\bowtie_C$) performs a product followed by a selection based on a specific condition $C$. The most common join is the Natural Join ($\bowtie$), which automatically pairs tuples that have equal values in all attributes shared by the two relations. After the pairing, it removes the redundant columns, leaving a cleaner result. Combining data from different relations is achieved through joining operations. The most basic of these is the Cartesian Product ($\times$), which pairs every tuple of one relation with every tuple of another. While mathematically simple, this operation is computationally expensive and rarely used alone in practice, as it creates a massive amount of often irrelevant data.
\dfn{Cartesian Product}{A binary operator $R \times S$ that produces a relation whose schema is the union of the schemas of $R$ and $S$, and whose tuples are all possible concatenations of a tuple from $R$ and a tuple from $S$.} To make combinations more meaningful, we use the Join operator ($\bowtie$). The natural join looks for attributes common to both relations and pairs tuples only when they share identical values for those common attributes. A more general version is the Theta-join, which pairs tuples based on an arbitrary condition (such as "greater than" or "not equal") rather than just simple equality.
\dfn{Natural Join}{A specific type of join $R \bowtie S$ that connects tuples based on equality across all common attributes and then projects out the duplicate columns.} \thm{The Equivalence of Theta-Joins}{Any Theta-join can be expressed as a Cartesian product followed immediately by a selection operation. Formally: $R \bowtie_{\theta} S = \sigma_{\theta}(R \times S)$.}
\thm{The Join-Product Relationship}{A theta-join $R \bowtie_C S$ is mathematically equivalent to the expression $\sigma_C(R \times S)$. This relationship allows query optimizers to choose between different physical execution strategies for the same logical request.} \nt{A "dangling tuple" refers to a record that does not find a match in the other relation during a join. In standard joins, these tuples are discarded from the result.}
\section{Linear Notation and Expression Trees} \section{Bag Semantics and Extended Relational Algebra}
Because relational algebra is a functional language, complex queries are built by nesting operations. These can be represented in two main ways. Linear notation involves a sequence of assignments to temporary variables, making the steps of a query easier to read. Alternatively, expression trees provide a graphical representation where leaves are stored relations and interior nodes are algebraic operators. While mathematical relational algebra assumes set semantics (where every element is unique), real-world systems like SQL often utilize bag semantics. In a bag, the same tuple can appear multiple times. This affects how set operations are calculated. For example, in a bag union, if a tuple appears $m$ times in $R$ and $n$ times in $S$, it will appear $m+n$ times in the result.
The query processor uses these trees to visualize the flow of data. By applying algebraic laws, the processor can "push" selections and projections down the tree, closer to the data sources. This reduces the size of intermediate relations as early as possible, which is a hallmark of efficient query execution. Extended relational algebra introduces operators to handle these practical requirements. These include the duplicate elimination operator ($\delta$), which turns a bag into a set, and the sorting operator ($\tau$), which treats the relation as a list to arrange tuples by specific values.
\thm{Selection Pushing}{In a query tree, moving a selection $\sigma$ below other operators like joins or unions is almost always beneficial, as it reduces the number of tuples that subsequent, more expensive operators must process.} The grouping and aggregation operator, denoted by gamma ($\gamma$), is perhaps the most powerful extended operator. It partitions tuples into groups based on "grouping keys" and applies an aggregate function—such as SUM, AVG, MIN, MAX, or COUNT—to each group.
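As a worked instance of this law (assuming the condition $C$ mentions only attributes of $R$):
\[
\sigma_{C}(R \bowtie S) = \sigma_{C}(R) \bowtie S
\]
Filtering $R$ before the join shrinks the left operand, so the join has to process fewer tuples while producing exactly the same result.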
\section{Extended Relational Algebra} \dfn{Aggregate Function}{A function that summarizes a collection of values from a column to produce a single representative value, such as a total or an average.}
To meet the requirements of SQL, the basic algebra is often extended with additional operators. These include duplicate elimination ($\delta$), which explicitly turns a bag into a set; sorting ($\tau$), which orders the tuples of a relation; and grouping and aggregation ($\gamma$), which partitions tuples into groups and calculates summaries like sums or averages. \section{Relational Algebra as a Constraint Language}
While these operators go beyond the original mathematical definition of the algebra, they are essential for practical database management. They allow the algebra to serve as a complete intermediate language for translating SQL queries into physical instructions for the machine. Relational algebra is not just for querying; it can also be used to define the rules that data must follow to be considered valid. These constraints ensure the integrity of the database. We can express any constraint by stating that a specific algebraic expression must result in an empty set ($\emptyset$), or that the result of one expression must be a subset of another.
\dfn{Duplicate Elimination}{The operator $\delta(R)$ that takes a bag $R$ as input and returns a set containing exactly one copy of every distinct tuple found in the input.} Key constraints can be represented by showing that if we join a table with itself and find two records with the same key but different attribute values, the set of such instances must be empty. Referential integrity (foreign keys) is expressed by asserting that the projection of a foreign key column in one table must be a subset of the projection of the primary key column in the referenced table.
\dfn{Aggregation}{The application of functions such as SUM, AVG, MIN, MAX, or COUNT to a column of a relation to produce a single summary value.} \thm{Referential Integrity Constraint}{In relational algebra, referential integrity is enforced by the subset inclusion: $\pi_{A}(R) \subseteq \pi_{B}(S)$, meaning every value of attribute $A$ in relation $R$ must exist in the set of values of attribute $B$ in relation $S$.}
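In SQL, this subset requirement is exactly what a foreign-key declaration enforces. A minimal sketch with invented relations:
\begin{verbatim}
CREATE TABLE Studios (
    name    VARCHAR(50)  PRIMARY KEY,
    address VARCHAR(100)
);

CREATE TABLE Movies (
    title      VARCHAR(100),
    year       INT,
    -- every studioName value must appear in Studios.name,
    -- i.e. pi_studioName(Movies) must be a subset of pi_name(Studios)
    studioName VARCHAR(50) REFERENCES Studios(name),
    PRIMARY KEY (title, year)
);
\end{verbatim}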
\section{Relational Algebra and Database Modifications}
The concepts of relational algebra also extend to how we modify the database state.
\begin{itemize}
\item \textbf{Deletion}: Removing tuples from a relation $R$ can be modeled as $R := R - \sigma_C(R)$, where $C$ is the deletion condition.
\item \textbf{Insertion}: Adding tuples is modeled as $R := R \cup S$, where $S$ is the set of new tuples.
\item \textbf{Update}: Updating a tuple is logically equivalent to deleting the old version and inserting a new one with modified values.
\end{itemize}
By viewing modifications through this lens, the system can use algebraic laws to optimize not only how we retrieve data, but also how we maintain it.
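Each of these algebraic forms has a direct SQL counterpart; a short sketch over a hypothetical Movies relation:
\begin{verbatim}
-- Deletion:  R := R - sigma_C(R)
DELETE FROM Movies WHERE year < 1950;

-- Insertion: R := R union S
INSERT INTO Movies (title, year) VALUES ('Metropolis', 1927);

-- Update: logically a delete of the old tuple plus an insert of the new one
UPDATE Movies SET length = 148 WHERE title = 'Metropolis' AND year = 1927;
\end{verbatim}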


@@ -1,131 +1,83 @@
\chapter{The Relational Model} \chapter{The Relational Model}
The relational model stands as the preeminent framework for managing data in contemporary information systems. Historically, the organization of data in tabular formats is a practice that stretches back nearly four millennia, beginning with early clay tablets. However, the modern digital iteration was pioneered by Edgar Codd in 1970. Codd's primary contribution was the principle of data independence, which strictly separates the logical representation of information from its physical implementation on storage devices. Before this shift, programmers were often required to understand the underlying physical structure of the data to perform even basic queries. The relational model replaced these complex, system-dependent methods with a high-level abstraction based on tables, which are referred to as relations. The relational model serves as the theoretical cornerstone of modern database systems, providing a structured yet flexible framework for data management. Proposed by Edgar Codd in 1970, this model revolutionized the field by introducing the principle of data independence. This principle decouples the logical representation of data—how users perceive and interact with it—from its physical storage on hardware. By representing information through intuitive two-dimensional tables, the model bridges the gap between complex mathematical theory and practical business applications. Interestingly, the tabular format is not a modern invention; historical evidence shows that humans have used clay tablets for relational data organization since at least 1800 BC. This enduring utility underscores the model's alignment with human cognitive patterns for managing structured facts.
In this model, data is represented as a collection of two-dimensional structures. This approach offers simplicity and versatility, allowing for anything from corporate records to scientific data to be modeled effectively. By restricting operations to a limited set of high-level queries, the relational model allows for significant optimization by the database management system, often performing tasks more efficiently than code written in general-purpose languages. This chapter details the structure, mathematical foundations, and design theories—specifically functional dependencies and normalization—that ensure data remains consistent and free from redundancy. \thm{Data Independence}{The separation of the logical data model from the physical storage implementation, allowing changes to the machine-level storage without affecting user queries or the logical view of the data.}
\section{Core Terminology and Structural Components} \section{Core Terminology and Structural Components}
The architecture of the relational model is defined by a specific set of terms that describe both the structure and the content of the data. In relational theory, specific terminology is used to describe the components of a database, often with synonyms used across different technical and business contexts. The primary structure is the relation, commonly referred to as a table. A relation consists of a set of attributes, which are the named columns that define the properties of the data stored. The set of these attributes, combined with the name of the relation itself, constitutes the relation schema.
\dfn{Attribute}{ \dfn{Relation Schema}{The formal description of a relation, comprising its name and a set of attributes, typically denoted as $R(A_1, A_2, \dots, A_n)$.}
An attribute is a named header for a column in a relation. It describes the meaning of the entries within that column. For example, in a table tracking information about movies, "title" and "year" would be typical attributes.
}
\dfn{Tuple}{ Each entry within a relation is called a tuple, which corresponds to a row in a table or a record in a file. A tuple contains a specific value for each attribute defined in the schema. These values, often called scalars, represent individual facts or characteristics.
A tuple is a single row in a relation, excluding the header row. It represents a specific instance of the entity described by the relation. A tuple contains one component for every attribute defined in the relation's schema.
}
\dfn{Relation Schema}{ \dfn{Tuple}{A single row or record within a relation, representing a specific instance of the entity or business object described by the schema.}
A relation schema consists of the name of the relation and the set of attributes associated with it. This is typically expressed as $R(A_1, A_2, \dots, A_n)$. A database schema is the total collection of all relation schemas within a system.
}
\dfn{Relation Instance}{ \nt{While mathematicians prefer to index tuples by numbers, database scientists identify components by their attribute names to provide semantic clarity.}
A relation instance is the specific set of tuples present in a relation at any given time. While schemas are relatively static, instances change frequently as data is inserted, updated, or deleted.
}
The components of a tuple must be atomic, meaning they are elementary types like integers or strings. The model explicitly forbids complex structures such as nested lists or sets as individual values. Every attribute is associated with a domain, which defines the set of permissible values or the specific data type for that column. \section{Domains and Atomic Values}
Every attribute in a relation is associated with a domain. A domain is essentially a data type or a set of permissible values that can appear in a specific column. For example, a "year" attribute might be restricted to the domain of integers, while a "name" attribute is restricted to the domain of character strings. A fundamental requirement of the standard relational model is that these values must be atomic. This means they cannot be further decomposed into smaller components, such as nested tables, lists, or sets. This requirement is formally known as the First Normal Form.
\dfn{Domain}{A set of values of a specific elementary type from which an attribute draws its components.}
\thm{Atomic Integrity}{The rule that every component of every tuple must be an indivisible, elementary value rather than a structured or repeating group.}
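A brief sketch (table and column names invented) of how atomic domains look in practice: every column is declared over an elementary type, and a repeating fact such as a movie's genres is moved into its own relation instead of being stored as a list inside a single cell.
\begin{verbatim}
CREATE TABLE Movie (
    title  VARCHAR(100),   -- domain: character strings
    year   INT,            -- domain: integers
    length INT
);

-- The repeating group "genres" becomes separate atomic tuples
CREATE TABLE MovieGenre (
    title VARCHAR(100),
    year  INT,
    genre VARCHAR(30)
);
\end{verbatim}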
\section{Mathematical Foundations of Relations} \section{Mathematical Foundations of Relations}
Mathematically, a relation is defined as a subset of the Cartesian product of the domains of its attributes. If an attribute $A$ has a domain $D$, then the entries in the column for $A$ must be elements of $D$. A record can be viewed as a partial function or a "map" from the set of attribute names to a set of atomic values. The relational model is built upon the mathematical concept of the Cartesian product. Given a family of domains $D_1, D_2, \dots, D_n$, a relation is defined as a subset of the Cartesian product $D_1 \times D_2 \times \dots \times D_n$. Each element of this subset is an $n$-tuple. This mathematical approach ensures that domain integrity and relational integrity are maintained by definition, as every value must belong to its prescribed set.
\thm{Relation as a Set}{ \dfn{Cartesian Product}{The set of all possible ordered tuples that can be formed by taking one element from each of the participating sets or domains.}
In the abstract mathematical model, a relation is a set of tuples. This implies that the order of the rows is irrelevant and that every tuple must be unique. Furthermore, because attributes are a set, the order of columns does not change the identity of the relation, provided the components of the tuples are reordered to match.
}
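A tiny worked example with invented domains makes the definition concrete:
\[
D_1 = \{1, 2\}, \qquad D_2 = \{a, b\}, \qquad D_1 \times D_2 = \{(1,a),\ (1,b),\ (2,a),\ (2,b)\},
\]
and any subset of this product, for instance $R = \{(1,a),\ (2,b)\}$, is a valid relation over two attributes whose domains are $D_1$ and $D_2$.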
While the theoretical model relies on set semantics, practical implementations often utilize alternate semantics: An alternative mathematical representation views a record as a map. In this perspective, a record $t$ is a partial function from a set of attribute names to a global set of values. This mapping approach is often preferred because it makes the order of attributes irrelevant, reflecting how databases actually operate in practice.
\begin{itemize}
\item \textbf{Bag Semantics}: Used by SQL, this allows for duplicate records within a table.
\item \textbf{List Semantics}: In this variation, the specific sequence of the records is preserved and carries meaning.
\end{itemize}
A database is formally defined as a set of these relational tables. To interact with this data, the model employs relational algebra, a system of operators that take one or more relations as input and produce a new relation as output. \nt{In a relation, the order of both the attributes and the tuples is immaterial; a relation remains the same regardless of how its rows or columns are permuted.}
\section{Integrity Constraints and Consistency} \section{Integrity and Consistency Rules}
To ensure the validity of data, the relational model enforces several categories of integrity. For a collection of data to be considered a valid relational table, it must adhere to three primary integrity rules. These rules ensure the consistency and predictability of the data.
\thm{Relational Integrity}{
The requirement that every record within a specific relation must possess the exact same set of attributes. Broken relational integrity occurs if attributes are missing or if redundant attributes appear in individual rows.
}
\thm{Atomic Integrity}{
Also known as the First Normal Form (1NF), this rule dictates that every value in a cell must be a single, indivisible unit. Complex data types cannot be stored within a single attribute field.
}
\thm{Domain Integrity}{
This constraint requires that every value for an attribute must belong to the predefined set of values or the data type associated with its domain.
}
\section{Defining Relation Schemas in SQL}
SQL (Structured Query Language) is the primary tool for implementing the relational model. It is divided into the Data-Definition Language (DDL) for creating and modifying schemas, and the Data-Manipulation Language (DML) for querying and updating data. The most fundamental command in DDL is the \texttt{CREATE TABLE} statement, which establishes the table name, its attributes, and their types.
\subsection{SQL Data Types}
Attributes must be assigned a primitive type. Common SQL types include:
\begin{itemize}
\item \textbf{CHAR(n)}: A fixed-length string of $n$ characters.
\item \textbf{VARCHAR(n)}: A variable-length string up to $n$ characters.
\item \textbf{INT / INTEGER}: Standard whole numbers.
\item \textbf{FLOAT / REAL}: Floating-point numbers.
\item \textbf{BOOLEAN}: Stores TRUE, FALSE, or UNKNOWN.
\item \textbf{DATE / TIME}: Specific formats for calendar dates (e.g., YYYY-MM-DD) and clock times.
\end{itemize}
\subsection{Keys and Uniqueness}
\dfn{Key}{
A key is a set of one or more attributes such that no two tuples in any possible relation instance can share the same values for all these attributes. A key must be minimal; no subset of its attributes can also be a key.
}
In SQL, keys are declared using the \texttt{PRIMARY KEY} or \texttt{UNIQUE} keywords. Attributes designated as a primary key are forbidden from containing NULL values, whereas \texttt{UNIQUE} columns may allow them depending on the system.
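A short sketch showing both declaration styles together with some of the types listed above (all names invented):
\begin{verbatim}
CREATE TABLE MovieStar (
    name      CHAR(30)     PRIMARY KEY,  -- key attributes must not be NULL
    address   VARCHAR(255),
    birthdate DATE,
    ssn       CHAR(11)     UNIQUE        -- also a key, but NULLs may be allowed
);
\end{verbatim}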
\section{Functional Dependencies}
A central concept in database design theory is the functional dependency (FD), which generalizes the idea of a key.
\thm{Functional Dependency}{
A functional dependency on a relation $R$ is an assertion that if two tuples agree on a set of attributes $A_1, \dots, A_n$, they must also agree on another set of attributes $B_1, \dots, B_m$. This is written as $A \rightarrow B$.
}
FDs are not merely observations about a specific instance of data but are constraints that must hold for every possible legal instance of the relation. They describe the relationships between attributes; for example, a movie's title and year might functionally determine its length and studio, as there is only one specific length and studio for a unique movie released in a given year.
\dfn{Superkey}{
A superkey is a set of attributes that contains a key as a subset. Therefore, every superkey functionally determines all attributes of the relation, but it may not be minimal.
}
The closure of a set of attributes under a set of FDs is the collection of all attributes that are functionally determined by that set. Calculating the closure allows designers to identify all keys of a relation and test if a new FD follows from the existing ones.
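A small worked closure computation over an invented schema shows how keys are found. Let $R(A, B, C, D)$ with the dependencies $A \rightarrow B$ and $B \rightarrow C$. Starting from $\{A, D\}$ and repeatedly adding every attribute determined by what is already in the set gives
\[
\{A, D\}^{+} = \{A, D, B, C\},
\]
so $\{A, D\}$ is a superkey; since neither $\{A\}^{+} = \{A, B, C\}$ nor $\{D\}^{+} = \{D\}$ covers all attributes, $\{A, D\}$ is in fact a key.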
\section{Anomalies and the Need for Decomposition}
Careless schema design leads to "anomalies," which are problems that occur when too much information is crammed into a single table. There are three primary types:
\begin{enumerate} \begin{enumerate}
\item \textbf{Redundancy}: Information is repeated unnecessarily across multiple rows (e.g., repeating a studio's address for every movie they made). \item \textbf{Relational Integrity:} This requires that all records within a specific table have the exact same set of attributes. A table cannot have "holes" or missing attributes in some rows but not others.
\item \textbf{Update Anomalies}: If a piece of redundant information changes, it must be updated in every row. Failure to do so leads to inconsistent data. \item \textbf{Atomic Integrity:} As previously noted, this prohibits the nesting of structures within a cell. A value must be a single fact.
\item \textbf{Deletion Anomalies}: Deleting a record might inadvertently destroy the only copy of unrelated information (e.g., deleting the last movie of a studio might remove the studio's address from the database entirely). \item \textbf{Domain Integrity:} This ensures that every value in a column is of the same kind, matching the type specified for that attribute in the schema.
\end{enumerate} \end{enumerate}
To eliminate these issues, designers use decomposition—the process of splitting a relation into two or more smaller relations whose attributes, when combined, include all the original attributes. \dfn{Domain Integrity}{The constraint that every value in a specific column must belong to the domain (data type) associated with that attribute.}
\section{Normal Forms} \thm{Relational Integrity}{The requirement that every record in a relation must possess the same support, meaning they all share the identical set of attributes defined in the schema.}
The goal of decomposition is to reach a normal form that guarantees the absence of certain anomalies. \section{Keys and Uniqueness}
\thm{Boyce-Codd Normal Form (BCNF)}{ To distinguish between tuples, the relational model relies on the concept of keys. A key is a set of one or more attributes that uniquely identifies a tuple within a relation instance. No two tuples in a valid relation can share the same values for all attributes in the key. Typically, one key is designated as the primary key.
A relation $R$ is in BCNF if and only if for every nontrivial functional dependency $A \rightarrow B$, the set of attributes $A$ is a superkey. In simpler terms, every determinant must be a key.
}
Any relation can be decomposed into a collection of BCNF relations. This process effectively removes redundancy caused by functional dependencies. However, while BCNF is very powerful, it does not always preserve all original dependencies. This leads to the use of a slightly relaxed condition. \dfn{Primary Key}{A specific attribute or minimal set of attributes chosen to uniquely identify each tuple in a relation, often indicated in a schema by underlining the attributes.}
\thm{Third Normal Form (3NF)}{ \nt{Identifying a primary key is essential for establishing relationships between different tables and maintaining data accuracy.}
A relation $R$ is in 3NF if for every nontrivial FD $A \rightarrow B$, either $A$ is a superkey, or every attribute in $B$ that is not in $A$ is "prime" (a member of some key).
}
3NF is useful because it is always possible to find a decomposition that is both lossless (the original data can be reconstructed) and dependency-preserving, which is not always true for BCNF. \section{Relation Instances and Temporal Change}
\section{Modifying and Removing Schemas} A relation is not a static object; it changes over time as tuples are inserted, deleted, or updated. The set of tuples present in a relation at any given moment is called an instance. Standard database systems typically only maintain the "current instance," representing the data as it exists right now. Changing a schema (adding or deleting columns) is a much more significant and expensive operation than changing an instance, as it requires restructuring every tuple currently stored.
Database structures are dynamic. SQL provides the \texttt{DROP TABLE} command to remove a relation and all its data permanently. For structural changes, the \texttt{ALTER TABLE} command is used. This allows for the addition of new attributes via \texttt{ADD} or the removal of existing ones via \texttt{DROP}. When new columns are added, existing tuples typically receive a \texttt{NULL} value or a specified \texttt{DEFAULT} value. \dfn{Relation Instance}{The specific set of tuples contained within a relation at a given point in time.}
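A minimal sketch of these statements over an invented table (the exact column-drop syntax varies slightly between systems):
\begin{verbatim}
ALTER TABLE MovieStar ADD phone CHAR(16) DEFAULT 'unlisted'; -- existing rows get the default
ALTER TABLE MovieStar DROP COLUMN birthdate;                 -- removes the column and its data
DROP TABLE  MovieStar;                                       -- removes the schema and all tuples
\end{verbatim}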
\section{Alternative Storage Semantics}
While the classical relational model is based on set semantics, where duplicate tuples are strictly forbidden, practical implementations often utilize different semantics based on the needs of the system.
\begin{enumerate}
\item \textbf{Set Semantics:} No duplicate records are allowed.
\item \textbf{Bag Semantics:} Duplicate records are permitted. This is common in SQL results, as eliminating duplicates is computationally expensive.
\item \textbf{List Semantics:} The specific order of the records is preserved and significant.
\end{enumerate}
\thm{Bag Semantics}{A variation of the relational model where duplicate tuples are allowed to exist within a relation, often used to improve the efficiency of query operations.}
\nt{The choice between set, bag, and list semantics is often a trade-off between mathematical purity and the performance requirements of a real-world database engine.}
\section{Conclusion}
The relational model's power lies in its simplicity and its firm mathematical grounding. By treating data as a collection of relations and providing a clear set of integrity rules, it allows for the creation of robust, scalable information systems. The use of schemas provides a stable contract for applications, while the principle of data independence ensures that the system can evolve technologically without breaking the logical structures that users depend on.
\nt{The relational model effectively acts as the "physics" of data, providing the laws that govern how digital information is structured and transformed.}


@@ -1,72 +1,90 @@
\chapter{Transactions and the Three Tiers} \chapter{Transactions and the Three Tiers}
The evolution of data management has shifted from localized, single-machine installations to complex, multi-tiered architectures that support massive user bases across the globe. This chapter explores the foundational structures of modern information systems, specifically focusing on how databases operate within a server environment. We examine the interaction between various layers of processing, known as the three-tier architecture, and the logical organization of data into environments, clusters, catalogs, and schemas. Furthermore, we investigate the mechanisms that allow general-purpose programming languages, such as Java, to interact with SQL through call-level interfaces like JDBC. Central to this discussion is the management of transactions, which ensure that even in highly concurrent and distributed settings, the integrity and consistency of data are maintained through the adherence to the ACID properties and the management of isolation levels. Modern database systems do not operate in isolation; they are embedded within complex multi-tier architectures designed to handle thousands of concurrent users. At the heart of this ecosystem is the concept of a transaction, a logical unit of work that ensures data integrity despite system failures or overlapping user actions. To maintain this integrity, databases adhere to the ACID properties—Atomicity, Consistency, Isolation, and Durability. This chapter explores the three-tier architecture that connects users to data, the hierarchical structure of the SQL environment, and the rigorous mechanics of transaction management, including isolation levels and locking protocols such as Two-Phase Locking (2PL).
\dfn{Database Management System}{A specialized software system designed to create, manage, and provide efficient, safe, and persistent access to large volumes of data over long periods of time.}
\section{The Three-Tier Architecture} \section{The Three-Tier Architecture}
Modern large-scale database installations typically utilize a three-tier or three-layer architecture. This structure is designed to separate different functional concerns, which allows for better scalability, security, and maintenance. Large-scale database installations typically utilize a three-tier architecture to separate concerns and improve scalability. This organization allows different components of the system to run on dedicated hardware, optimizing performance for each specific task.
\thm{Three-Tier Architecture}{A system organization that distinguishes between three interacting layers: the Web Server tier (user interface), the Application Server tier (business logic), and the Database Server tier (data management).} \dfn{Three-Tier Architecture}{A system organization consisting of three distinct layers: the Web Server tier for user interaction, the Application Server tier for processing logic, and the Database Server tier for data management.}
The first layer is the Web-Server Tier. This tier manages the primary interaction with the user, often through the Internet. When a customer accesses a service, a web server responds to the initial request and presents the interface, such as an HTML page with forms and menus. The client's browser handles the user's input and transmits it back to the web server, which then communicates with the application tier. The first tier consists of \textbf{Web Servers}. These processes act as the entry point for clients, usually interacting via a web browser over the Internet. When a user enters a URL or submits a form, the browser sends an HTTP (Hypertext Transfer Protocol) request to the web server. The web server is responsible for returning an HTML page, which may include images and other data to be displayed to the user.
The middle layer is the Application-Server Tier. This is where the "business logic" of an organization resides. The responsibility of this tier is to process requests from the web server by determining what data is needed and how it should be presented. In complex systems, this tier might be divided into subtiers, such as one for object-oriented data handling or another for information integration, where data from multiple disparate sources is combined. The application tier performs the heavy lifting of turning raw database information into a meaningful response for the end user. \nt{Common web server software includes Apache and Tomcat, which are frequently used in both professional and academic environments to bridge the gap between web browsers and database systems.}
The final layer is the Database-Server Tier. This layer consists of the processes that run the Database Management System (DBMS). It receives query and modification requests from the application tier and executes them against the stored data. To ensure efficiency, this tier often maintains a pool of open connections that can be shared among various application processes, avoiding the overhead of constantly opening and closing connections. The second tier is the \textbf{Application Server}, often referred to as the \textbf{Business Logic} layer. This is where the core functionality of the system resides. When the web server receives a request that requires data, it communicates with the application tier. Programmers use languages such as Java, Python, C++, or PHP to write the logic that decides how to respond to user requests. This layer is responsible for generating SQL queries, sending them to the database, and formatting the returned results into a programmatically built HTML page or other responses.
\section{The SQL Environment and Its Logical Organization} The third tier is the \textbf{Database Server}. These are the processes running the Database Management System (DBMS), such as PostgreSQL or MySQL. This tier executes the queries requested by the application tier, manages data persistence on disk, and ensures that the system remains responsive through buffering and connection management.
The SQL environment provides the framework within which data exists and operations are executed. This environment is organized into a specific hierarchy to manage terminology and scope. \section{The SQL Environment}
\dfn{SQL Environment}{The overall framework, typically an installation of a DBMS at a specific site, under which database elements are defined and SQL operations are performed.} Within the database tier, data is organized in a hierarchical framework known as the SQL environment. This structure allows for a clear namespace and organizational scope for all database elements.
At the top of this hierarchy is the Cluster. A cluster is a collection of catalogs and represents the maximum scope over which a single database operation can occur. Essentially, it is the entire database as perceived by a specific user. \dfn{SQL Environment}{The overall framework under which database elements exist and SQL operations are executed, typically representing a specific installation of a DBMS.}
Below the cluster is the Catalog. Catalogs are used to organize schemas and provide a unique naming space. Each catalog must contain a special schema that holds information about all other schemas within that catalog. The hierarchy begins with the \textbf{Cluster}, which represents the maximum scope for a database operation and the set of all data accessible to a particular user. Within a cluster, data is organized into \textbf{Catalogs}. A catalog is the primary unit for supporting unique terminology and contains one or more \textbf{Schemas}.
The most basic unit of organization is the Schema. A schema is a collection of database elements such as tables, views, triggers, and assertions. One can create a schema using a specific declaration or modify it over time. A schema is a collection of database objects, including tables, views, triggers, and assertions. In professional environments, a full name for a table might look like \texttt{CatalogName.SchemaName.TableName}. If the catalog or schema is not explicitly specified, the system defaults to the current session's settings (e.g., \texttt{public} is often the default schema).
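A brief sketch of how the hierarchy shows up in SQL (names invented; explicit catalog qualification is rarely needed in practice and is not supported by every system):
\begin{verbatim}
CREATE SCHEMA MovieSchema;                 -- a new schema in the current catalog

CREATE TABLE MovieSchema.Movies (          -- schema-qualified table name
    title VARCHAR(100),
    year  INT
);

SELECT title FROM MovieSchema.Movies;      -- without the prefix, the session's
                                           -- default schema (often public) is used
\end{verbatim}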
\section{Establishing Connections and Sessions} \thm{The Concept of Sessions}{A session is the period during which a connection between a SQL client and a SQL server is active, encompassing a sequence of operations performed under a specific authorization ID.}
For a program or a user to interact with the database server, a link must be established. This is handled through connections and sessions. A connection is the physical or logical link between a SQL client (often the application server) and a SQL server. A user can open multiple connections, but only one can be active at any given moment. \section{Fundamentals of Transactions}
\dfn{Session}{The sequence of SQL operations performed while a specific connection is active. It includes state information such as the current catalog, current schema, and the authorized user.} A transaction is a single execution of a program or a batch of queries that must be treated as an indivisible unit. The goal of the transaction manager is to ensure that even if the system crashes or multiple users access the same record, the result remains correct.
When a connection is established, it usually requires an authorization clause, which includes a username and password. This ensures that the current authorization ID has the necessary privileges to perform the requested actions. In this context, a "Module" refers to the application program code, while a "SQL Agent" is the actual execution of that code. \dfn{Transaction}{A collection of one or more database operations, such as reads and writes, that are grouped together to be executed atomically and in isolation from other concurrent actions.}
\section{Transactions and the ACID Properties} To be considered reliable, every transaction must satisfy the \textbf{ACID} test. These four properties are the cornerstone of database design theory.
Transactions are the fundamental units of work in a database system. To ensure that the database remains in a consistent state despite concurrent access or system failures, every transaction must follow a set of requirements known as the ACID properties. \thm{ACID Properties}{
\begin{itemize}
\item \textbf{Atomicity:} Often described as "all-or-nothing," this ensures that a transaction is either fully completed or not executed at all. If a failure occurs halfway through, any partial changes must be undone.
\item \textbf{Consistency:} A transaction must take the database from one consistent state to another, satisfying all integrity constraints like primary keys and check constraints.
\item \textbf{Isolation:} Each transaction should run as if it were the only one using the system, regardless of how many other users are active.
\item \textbf{Durability:} Once a transaction has been committed, its effects must persist in the database even in the event of a power outage or system crash.
\end{itemize}}
\thm{ACID Properties}{A set of four essential characteristics of a transaction: Atomicity (all-or-nothing execution), Consistency (preserving database invariants), Isolation (executing as if in isolation), and Durability (permanent storage of results).} \nt{Atomicity in transactions should not be confused with atomic values in First Normal Form. In this context, it refers to the indivisibility of the execution process itself.}
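A minimal transaction sketch over a hypothetical Accounts(id, balance) table: either both updates become durable at \texttt{COMMIT}, or a \texttt{ROLLBACK} (or a crash before the commit) leaves the database as if nothing had happened.
\begin{verbatim}
START TRANSACTION;
UPDATE Accounts SET balance = balance - 100 WHERE id = 1;
UPDATE Accounts SET balance = balance + 100 WHERE id = 2;
COMMIT;       -- make both changes permanent atomically
-- ROLLBACK;  -- would instead undo both updates
\end{verbatim}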
Atomicity ensures that if a transaction is interrupted, any partial changes are rolled back, leaving the database as if the transaction never started. Consistency guarantees that a transaction moves the database from one valid state to another, respecting all defined rules. Isolation is managed by a scheduler to ensure that the concurrent execution of multiple transactions results in a state that could have been achieved if they were run one after another. Finally, Durability ensures that once a transaction is committed, its effects will survive even a subsequent system crash. \section{Concurrency and Isolation Levels}
\subsection{Transactional Phenomena and Isolation Levels} When multiple transactions run at the same time, their actions may interleave in a way that leads to inconsistencies. A \textbf{Schedule} is the actual sequence of actions (reads and writes) performed by these transactions. While a \textbf{Serial Schedule} (running one transaction after another) is always safe, it is inefficient. Schedulers instead aim for \textbf{Serializability}.
When multiple transactions run simultaneously, several problematic phenomena can occur if isolation is not strictly enforced. \thm{Serializability}{A schedule is serializable if its effect on the database is identical to the effect of some serial execution of the same transactions.}
1. \textbf{Dirty Read}: This happens when one transaction sees data that has been written by another transaction but has not yet been committed. If the first transaction eventually aborts, the data seen by the second transaction effectively never existed. If isolation is not properly managed, several types of "anomalies" can occur. These phenomena describe undesirable interactions between concurrent processes.
2. \textbf{Nonrepeatable Read}: A transaction reads the same data twice but finds different values because another transaction modified and committed that data in the meantime.
3. \textbf{Phantom Read}: A transaction runs a query multiple times and finds "phantom" rows that were inserted by another committed transaction during the process.
4. \textbf{Serialization Anomaly}: This occurs when the result of a group of concurrent transactions is inconsistent with any serial ordering of those same transactions.
To manage these risks, SQL defines various "Isolation Levels." The most stringent is "Serializable," which prevents all the aforementioned phenomena. Lower levels, such as "Read Committed" or "Read Uncommitted," allow for higher concurrency at the risk of encountering some of these issues. \dfn{Dirty Read}{A situation where one transaction reads data that has been modified by another transaction but has not yet been committed. If the first transaction subsequently aborts, the second transaction has based its work on data that "never existed."}
\subsection{Java Database Connectivity (JDBC)} \dfn{Non-repeatable Read}{Occurs when a transaction reads the same data element twice but finds different values because another transaction modified and committed that element in the interim.}
One of the most common ways to implement the application tier is through Java, using the JDBC call-level interface. JDBC allows a Java program to interact with virtually any SQL database by using a standard set of classes and methods. \dfn{Phantom Read}{A phenomenon where a transaction runs a query to find a set of rows, but upon repeating the query, finds additional "phantom" rows that were inserted and committed by a concurrent transaction.}
\dfn{JDBC}{A Java-based API that provides a standard library of classes for connecting to a database, executing SQL statements, and processing the results.} SQL provides four \textbf{Isolation Levels} that allow developers to trade off strictness for performance.
The process begins by loading a driver for the specific DBMS, such as MySQL or PostgreSQL. Once the driver is loaded, a connection is established using a URL that identifies the database, along with credentials for authorization. \begin{itemize}
\item \textbf{Read Uncommitted:} The most relaxed level; allows dirty reads.
\item \textbf{Read Committed:} Forbids dirty reads but allows non-repeatable reads.
\item \textbf{Repeatable Read:} Forbids dirty and non-repeatable reads but may allow phantoms.
\item \textbf{Serializable:} The strictest level; ensures the result is equivalent to some serial order.
\end{itemize}
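In SQL, the level is requested per transaction; a brief sketch (the default level and the exact placement of the \texttt{SET TRANSACTION} statement differ between systems; the form below follows PostgreSQL):
\begin{verbatim}
START TRANSACTION;
SET TRANSACTION ISOLATION LEVEL SERIALIZABLE;   -- or READ COMMITTED, etc.
-- ... the statements of this transaction run here ...
COMMIT;
\end{verbatim}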
In JDBC, there are different types of statements used to interact with the data. A simple \texttt{Statement} is used for queries without parameters, while a \texttt{PreparedStatement} is used when a query needs to be executed multiple times with different values. These parameters are denoted by question marks in the SQL string and are bound to specific values before execution. \section{Locking and Two-Phase Locking (2PL)}
The result of a query in JDBC is returned as a \texttt{ResultSet} object. This object acts like a cursor, allowing the program to iterate through the resulting tuples one at a time using a \texttt{next()} method. For each tuple, the programmer uses specific getter methods, such as \texttt{getInt()} or \texttt{getString()}, to extract data based on the attribute's position in the result. The most common way for a database to enforce serializability is through the use of \textbf{Locks}. Before a transaction can read or write a piece of data, it must obtain a lock on that element. These are managed via a \textbf{Lock Table} in the scheduler.
\thm{JDBC Interaction Pattern}{The standard flow of database access in Java: Load Driver $\rightarrow$ Establish Connection $\rightarrow$ Create Statement $\rightarrow$ Execute Query/Update $\rightarrow$ Process Results via ResultSet $\rightarrow$ Close Connection.} \dfn{Shared and Exclusive Locks}{A Shared (S) lock is required for reading and allows multiple transactions to read the same element. An Exclusive (X) lock is required for writing and prevents any other transaction from accessing that element.}
This interface effectively solves the "impedance mismatch" between the set-oriented world of SQL and the object-oriented world of Java. By providing a mechanism to fetch rows individually, it allows Java's iterative control structures to process data retrieved from SQL's relational queries. Furthermore, it supports the execution of updates, which encompass all non-query operations like insertions, deletions, and schema modifications. This robust framework is essential for building the business logic required in the application tier of the three-tier architecture. Simply using locks is not enough to guarantee serializability; the timing of when locks are released is critical. If a transaction releases a lock too early, another transaction might intervene and change the data, leading to a non-serializable schedule. To prevent this, systems use the \textbf{Two-Phase Locking (2PL)} protocol.
\thm{Two-Phase Locking (2PL)}{A protocol requiring that in every transaction, all locking actions must precede all unlocking actions. This creates two distinct phases: a "growing phase" where locks are acquired and a "shrinking phase" where they are released.}
\nt{Strict Two-Phase Locking is a variation where a transaction does not release any exclusive locks until it has committed or aborted. This prevents other transactions from reading dirty data and avoids the need for cascading rollbacks.}
A potential downside of locking is the risk of a \textbf{Deadlock}. This occurs when two or more transactions are stuck in a cycle, each waiting for a lock held by the other. Schedulers must be able to detect these cycles—often using a \textbf{Waits-For Graph}—and resolve them by aborting one of the transactions.
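A sketch of a schedule that produces such a cycle, using a hypothetical Accounts table; each transaction runs in its own session, and the interleaving is shown top to bottom as comments rather than as a single runnable script:
\begin{verbatim}
-- T1: UPDATE Accounts SET balance = balance - 10 WHERE id = 1;  -- X-lock on row 1
-- T2: UPDATE Accounts SET balance = balance - 10 WHERE id = 2;  -- X-lock on row 2
-- T1: UPDATE Accounts SET balance = balance + 10 WHERE id = 2;  -- blocks, waits for T2
-- T2: UPDATE Accounts SET balance = balance + 10 WHERE id = 1;  -- blocks, waits for T1
--     => the waits-for graph contains the cycle T1 -> T2 -> T1,
--        so the scheduler aborts one of the two transactions
\end{verbatim}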
In conclusion, the management of transactions requires a deep integration of architectural tiers, hierarchical environments, and rigorous concurrency control. By utilizing ACID properties, various isolation levels, and the 2PL protocol, database systems provide a robust platform where users can safely interact with data as if they were the sole occupants of the system.
\nt{In practice, many developers use higher-level APIs like JDBC for Java or PHP's PEAR DB library to handle the complexities of database connections and transaction boundaries programmatically.}
To think of it another way, a transaction is like a single entry in a shared diary. Even if twenty people are writing in the same diary simultaneously, the system acts like a careful librarian, ensuring that each person's entry is written cleanly on its own line without anyone's ink smudging another's work.


@@ -1,63 +1,121 @@
\chapter{Views and Indices} \chapter{Views and Indices}
This chapter explores the conceptual and physical layers of database management, focusing on the mechanisms that allow users to interact with data flexibly while ensuring that the underlying hardware performs at its peak. The discussion is divided into two primary concepts: views and indexes. The management of information within a relational database system involves a balance between logical abstraction and physical performance. While base tables provide the primary storage for data, they are not always optimized for the specific ways in which users interact with information. To bridge this gap, database systems utilize two critical components: \textbf{Views} and \textbf{Indices}. Views allow designers to create virtual relations that simplify complex queries and provide security by filtering access to specific attributes or rows. Indices, on the other hand, focus on the physical layer, providing specialized data structures that accelerate the retrieval of tuples without requiring exhaustive table scans. Together, these tools ensure that a database is both easy to use for the developer and efficient for the machine. This chapter explores the declaration and querying of virtual views, the limitations of updating them, the mechanics of indexing, and the mathematical models used to determine when an index provides a genuine performance benefit.
Virtual views represent a method of logical abstraction. They allow a database designer to present users with data organized in a way that is most convenient for their specific tasks, without necessarily altering the structure of the base tables where the information is physically stored. These virtual relations are computed on demand and provide a layer of data independence, protecting applications from changes in the underlying schema and offering a simplified interface for complex queries. \section{Virtual Views and Logical Abstraction}
On the physical side, indexes are specialized data structures used to circumvent the high cost of exhaustive table scans. By providing direct paths to specific tuples based on the values of search keys, indexes significantly reduce the number of disk accesses required for lookups and joins. However, the creation of an index is not a cost-free operation. It involves a fundamental trade-off between the acceleration of read operations and the increased overhead associated with insertions, deletions, and updates. This summary evaluates the criteria for view updatability, the mechanics of index implementation, and the rigorous cost models used to determine the optimal configuration of physical storage. A virtual view is a relation that does not exist as a separate entity on the physical disk but is instead defined by a query over one or more base tables. From the perspective of the user or the application layer, a view is indistinguishable from a standard table; it can be queried, joined with other relations, and used in subqueries.
\section{Virtual Views in a Relational Environment} \dfn{Virtual View}{A named relation defined by an expression or query that acts as a shortcut to data stored in other relations. It is persistent in definition but transient in representation, meaning its contents are recomputed or mapped back to base tables every time it is accessed.}
In a standard database, relations created through table declarations are considered persistent or "base" tables. These structures are physically stored on disk and remain unchanged unless modified by specific commands. In contrast, a virtual view is a relation defined by a SQL expression, typically a query. It does not exist in storage as a set of tuples; instead, its content is dynamically generated whenever it is referenced. The primary advantage of using views is \textbf{productivity}. Instead of repeating a complex subquery multiple times across different parts of an application, a developer can define that subquery as a view and refer to it by name.
\dfn{Virtual View}{A named virtual relation defined by a query over one or more existing base tables or other views, which is not physically materialized in the database.} \thm{The Interpretation of View Queries}{When a query refers to a virtual view, the query processor logically replaces the view name with its underlying definition. This effectively turns the query into a larger expression that operates directly on the base tables, ensuring that the view always reflects the most current state of the database.}
\thm{View Expansion}{The query processing mechanism whereby the name of a view in a SQL query is replaced by the query expression that defines it, allowing the system to optimize the operation as if it were performed directly on the base tables.} \nt{To declare a view in SQL, the \texttt{CREATE VIEW} statement is used followed by the keyword \texttt{AS} and a standard \texttt{SELECT-FROM-WHERE} block. If a developer wishes to change the column names presented by the view to be more descriptive or to avoid name collisions, they can list the new attribute names in parentheses immediately following the view name.}
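A minimal sketch of such a declaration, with invented relation and attribute names:
\begin{verbatim}
CREATE VIEW ParamountMovies (movieTitle, movieYear) AS
    SELECT title, year
    FROM   Movies
    WHERE  studioName = 'Paramount';

-- Queried like any base table; the processor expands the name
-- into the defining query over Movies
SELECT movieTitle FROM ParamountMovies WHERE movieYear = 1979;
\end{verbatim}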
When a view is defined, the system stores only its definition. From the perspective of a user, the view is indistinguishable from a base table. It possesses a schema and can be the target of queries. Furthermore, attributes in a view can be renamed during declaration to provide clearer identifiers for the end-user. This is particularly useful when the underlying table uses technical or ambiguous column names. For instance, a view might extract movie titles and production years from a comprehensive database to present a simplified list of films belonging to a specific studio. \section{Updatable Views and Modification Criteria}
\section{Modification and Update Logic for Views} While querying a view is straightforward, modifying one—through \texttt{INSERT}, \texttt{DELETE}, or \texttt{UPDATE}—presents a logical challenge. Since a view is virtual, any change must be translated into a corresponding change in the underlying base tables. SQL allows this only under specific, restrictive conditions to ensure that the translation is unambiguous.
While querying a view is straightforward, modifying one—through insertions, updates, or deletions—is conceptually complex because the view contains no physical tuples. For a modification to be successful, the database management system must be able to translate the request into an equivalent sequence of operations on the underlying base tables. \dfn{Updatable View}{A virtual view that the DBMS can modify by passing the changes through to the base relation. In standard SQL, this generally requires the view to be defined over a single relation and to include enough attributes so that a valid tuple can be formed in the underlying table.}
\dfn{Updatable View}{A virtual view that is sufficiently simple for the system to automatically map modifications back to the original base relations without ambiguity.} To be considered updatable without the help of triggers, a view typically must meet several criteria:
\begin{itemize}
\item The \texttt{FROM} clause must contain exactly one relation.
\item There can be no \texttt{DISTINCT} keyword, as this would make it impossible to determine which original tuple a change refers to.
\item The \texttt{WHERE} clause cannot use the relation itself in a subquery.
\item The \texttt{SELECT} list must include enough attributes to satisfy the \texttt{NOT NULL} and primary key constraints of the base table, or those omitted attributes must have default values.
\end{itemize}
\thm{Criteria for Updatability}{To be updatable, a view must generally be defined by a simple selection and projection from a single relation. It cannot involve duplicate elimination, aggregations, or group-by clauses, and it must include all attributes necessary to form a valid tuple in the base relation.} \nt{If an insertion is made into a view and the view projects out the attribute used in the \texttt{WHERE} clause, the new tuple might disappear from the view immediately after being added. This is because the underlying table receives the tuple with a \texttt{NULL} or default value that may not satisfy the view's selection criteria.}
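For example, the single-table view sketched earlier is updatable; an insertion through it is passed to the base table, and the disappearing-tuple effect described in the note above can be seen directly (names remain invented):
\begin{verbatim}
INSERT INTO ParamountMovies (movieTitle, movieYear)
VALUES ('Star Trek', 1979);
-- Translated into: INSERT INTO Movies(title, year) VALUES ('Star Trek', 1979);
-- studioName is NULL in the base table, so the new tuple does not
-- satisfy studioName = 'Paramount' and is invisible through the view.
\end{verbatim}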
If a view is defined over multiple relations, such as through a join, it is typically not updatable because the logic for handling the change is not unique. For example, if a tuple is deleted from a view joining movies and producers, it is unclear whether the system should delete the movie, the producer, or both. To overcome these limitations, SQL provides "instead-of" triggers. These allow the designer to intercept a modification attempt on a view and define a custom set of actions to be performed on the base tables instead. This ensures that the intended semantics of the operation are preserved regardless of the complexity of the view's definition. \section{Instead-Of Triggers}
\section{Physical Indexes and Retrieval Performance} When a view is too complex to be automatically updatable (for instance, when it involves joins or aggregations), a database designer can use \textbf{Instead-Of Triggers}. These allow the programmer to explicitly define how a modification to a view should be handled by the system.
The efficiency of data retrieval is largely determined by the number of disk blocks the system must access. Without an index, the database must perform a full scan of a relation to find specific tuples. For large relations spanning thousands of blocks, this process is prohibitively slow. An index is a physical structure that maps values of a search key to the physical locations of the tuples containing those values. \thm{Instead-Of Trigger Principle}{An instead-of trigger intercepts a modification command intended for a view and executes a specified block of code in its place. This code usually involves custom logic to distribute the modification across multiple base tables or to calculate missing values.}
\dfn{Index}{A physical data structure designed to accelerate the location of tuples within a relation based on specified attribute values, bypassing the need for an exhaustive scan of all blocks.} \nt{By using the \texttt{REFERENCING NEW ROW AS} clause, the trigger can access the values the user attempted to insert into the view and use them as parameters for updates to the actual stored tables.}
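A sketch in the textbook-style trigger syntax referenced in the note (real systems such as PostgreSQL or Oracle use slightly different trigger syntax); it fills in the missing studioName so that tuples inserted through the view actually appear in it:
\begin{verbatim}
CREATE TRIGGER ParamountInsert
INSTEAD OF INSERT ON ParamountMovies
REFERENCING NEW ROW AS NewRow
FOR EACH ROW
    INSERT INTO Movies(title, year, studioName)
    VALUES (NewRow.movieTitle, NewRow.movieYear, 'Paramount');
\end{verbatim}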
\dfn{Multi-attribute Index}{An index built on a combination of two or more attributes, allowing the system to efficiently find tuples when values for all or a prefix of those attributes are provided in a query.} \section{Physical Storage and the Motivation for Indices}
Indexes are most commonly implemented as B-trees or hash tables. A B-tree is a balanced structure where every path from the root to a leaf is of equal length, ensuring predictable performance for both point lookups and range queries. In most modern systems, the B+ tree variant is used, where pointers to the actual data records are stored only at the leaf nodes. This structure allows the system to navigate through the index by comparing search keys, moving from a root block down to the appropriate leaf with minimal disk I/O. In a database without indices, the only way to find a specific record is through a \textbf{Full Scan}. This requires the system to read every block of the relation from the disk and check every tuple against the search condition. While this is feasible for small tables, it becomes a massive bottleneck as the data grows into millions or billions of rows.
\section{Selection and Performance Analysis of Indexes} \dfn{Index}{A supplementary data structure that associates specific values of one or more attributes (the search key) with pointers to the physical locations of the records containing those values. Its purpose is to allow the system to bypass irrelevant data and jump directly to the desired blocks.}
The decision of whether to build an index on a particular attribute requires a careful analysis of the expected workload. While an index speeds up queries, every modification to the underlying relation requires a corresponding update to the index. This secondary update involves reading and writing index blocks, which can double the cost of insertions and deletions. \thm{The I/O Model of Computation}{The cost of a database operation is primarily determined by the number of disk I/O actions it requires. Because moving data from the disk to main memory is orders of magnitude slower than CPU operations, the efficiency of a physical plan is measured by how many blocks must be read or written.}
\dfn{Clustering Index}{An index where the physical order of the tuples on disk corresponds to the order of the index entries, ensuring that all tuples with a specific key value are stored on as few blocks as possible.} \nt{It is important to distinguish between the \textbf{search key} of an index and the \textbf{primary key} of a relation. An index can be built on any attribute or set of attributes, regardless of whether they are unique.}
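Index creation is not part of the core SQL standard, but the syntax below is close to universal; the names are invented, and the exact form of \texttt{DROP INDEX} varies between systems:
\begin{verbatim}
CREATE INDEX YearIndex ON Movies (year);         -- single-attribute search key
CREATE INDEX KeyIndex  ON Movies (title, year);  -- multi-attribute search key
DROP INDEX YearIndex;                            -- remove an index whose maintenance
                                                 -- cost outweighs its benefit
\end{verbatim}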
\thm{The Index Selection Trade-off}{The process of evaluating whether the time saved during the execution of frequent queries outweighs the time lost during the maintenance of the index for insertions, updates, and deletions.} \section{Clustered and Non-Clustered Indices}
To make this determination, database administrators use a cost model centered on disk I/O. If a relation is clustered on an attribute, the cost of retrieving all tuples with a specific value is approximately the number of blocks occupied by the relation divided by the number of distinct values of that attribute. If the index is non-clustering, each retrieved tuple may potentially reside on a different block, leading to a much higher retrieval cost. A tuning advisor or administrator will calculate the average cost of all anticipated operations (queries and updates) to decide which set of indexes minimizes the total weighted cost for the system. The relationship between the order of an index and the physical arrangement of tuples on the disk significantly impacts performance.
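A small worked example with assumed numbers illustrates the gap: suppose a relation occupies $B = 1000$ blocks, contains $T = 20{,}000$ tuples, and the indexed attribute has $V = 50$ distinct values. Retrieving all tuples with one particular value costs roughly $B / V = 1000 / 50 = 20$ disk I/Os when the relation is clustered on that attribute, but up to $T / V = 20{,}000 / 50 = 400$ I/Os (in the worst case one block per matching tuple) with a non-clustering index, plus a few additional I/Os for traversing the index itself in either case.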
\section{Clustered and Non-Clustered Indices}
The relationship between the order of an index and the physical arrangement of tuples on the disk significantly impacts performance.
\dfn{Clustering Index}{An index where the physical order of the records in the data file matches or closely follows the order of the keys in the index. This ensures that all tuples with the same or similar key values are packed into the minimum possible number of blocks.}
\thm{Clustering Efficiency}{A clustering index is much more efficient for range queries than a non-clustering index. In a clustered scenario, the system can retrieve a range of values by reading consecutive blocks. In a non-clustered scenario, every matching tuple might reside on a different block, potentially requiring one disk I/O per tuple.}
\nt{A relation can only have one clustering index because the data can only be physically sorted in one way. However, it can have multiple non-clustering (secondary) indices.}

\section{Materialized Views and Automated Tuning}
Beyond virtual views and physical indexes, database systems often employ materialized views. Unlike a virtual view, a materialized view is physically computed and stored on disk. This approach is beneficial for high-complexity queries that are executed frequently, such as those involving expensive joins or aggregations in a data warehousing environment.
\dfn{Materialized View}{A view whose query result is physically stored in the database, requiring an explicit maintenance strategy to synchronize its content with changes in the base tables.}
The use of materialized views introduces a maintenance cost similar to that of indexes. Every time a base table changes, the materialized view must be updated, either immediately or on a periodic schedule. Because the number of possible views is virtually infinite, modern systems use automated tuning advisors. These tools analyze a query log to identify representative workloads and then use a greedy algorithm to recommend the combination of indexes and materialized views that will provide the greatest overall benefit to the system's performance.
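The greedy selection can be sketched in a few lines of Python. This is only an illustration: the candidate structures, their per-operation costs, the workload counts, and the storage budget are all invented, and a real advisor would re-estimate benefits after every pick because indexes and materialized views interact.
\begin{verbatim}
# Minimal sketch of a greedy tuning advisor (illustrative only).
# Each candidate structure is scored by the estimated I/O it saves on the
# query workload minus the I/O spent keeping it up to date.

# Hypothetical candidates: name -> (I/O saved per query that uses it,
#                                   extra I/O per update that touches it)
CANDIDATES = {
    "idx_orders_customer": (90.0, 2.0),
    "idx_orders_date":     (40.0, 2.0),
    "mv_daily_revenue":    (300.0, 25.0),
}

# Hypothetical workload counts taken from a query log.
WORKLOAD = {
    "idx_orders_customer": {"queries": 500, "updates": 2000},
    "idx_orders_date":     {"queries": 100, "updates": 2000},
    "mv_daily_revenue":    {"queries": 50,  "updates": 2000},
}

STORAGE_BUDGET = 2  # arbitrary cap on the number of auxiliary structures


def net_benefit(name):
    """Estimated I/O saved by queries minus I/O spent on maintenance."""
    saved, maintenance = CANDIDATES[name]
    load = WORKLOAD[name]
    return load["queries"] * saved - load["updates"] * maintenance


def greedy_advisor():
    """Repeatedly pick the candidate with the highest positive net benefit."""
    chosen, remaining = [], set(CANDIDATES)
    while remaining and len(chosen) < STORAGE_BUDGET:
        best = max(remaining, key=net_benefit)
        if net_benefit(best) <= 0:
            break  # nothing left that pays for its own maintenance
        chosen.append(best)
        remaining.remove(best)
    return chosen


if __name__ == "__main__":
    for name in sorted(CANDIDATES, key=net_benefit, reverse=True):
        print(f"{name:22s} net benefit = {net_benefit(name):10.1f}")
    print("recommended:", greedy_advisor())
\end{verbatim}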
\section{The Mechanics of B-Trees}
The most prevalent index structure in modern database systems is the \textbf{B-Tree}, specifically the B+ Tree variant. This structure is a balanced tree that automatically scales with the size of the data.
\dfn{B-Tree}{A balanced tree structure where every path from the root to a leaf is of equal length. Each node corresponds to a single disk block and contains a sorted list of keys and pointers to either child nodes or data records.}
B-Trees are characterized by a parameter $n$, which defines the maximum number of keys a block can hold. The rules for a B-Tree include the following (a small numeric instance is given just after the list):
\begin{itemize}
\item \textbf{Internal Nodes:} Must have between $\lceil (n+1)/2 \rceil$ and $n+1$ children, except for the root, which can have as few as two.
\item \textbf{Leaves:} Hold the actual search keys and pointers to the records. They also include a pointer to the next leaf in sequence to facilitate range scans.
\end{itemize}
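As an illustrative instance (the value of $n$ is chosen here only for the example): with $n = 3$ keys per block, an internal node must have between $\lceil (3+1)/2 \rceil = 2$ and $3 + 1 = 4$ child pointers, while the root may temporarily have only two children, for example right after the very first split.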
\thm{Logarithmic Search Complexity}{In a B-Tree, the number of steps required to find any specific key is proportional to the height of the tree. For a tree with $N$ records and a fan-out of $f$, the height is approximately $\log_f N$. Because $f$ is typically large (often hundreds of keys per block), even a billion records can be searched in just three or four disk accesses.}
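As a rough illustration (the fan-out is an assumed figure): with $f = 300$ and $N = 10^9$ records,
\[
  \log_{300} 10^{9} = \frac{9 \ln 10}{\ln 300} \approx \frac{20.7}{5.7} \approx 3.6,
\]
so about four levels suffice; and because the root block (and often the entire second level) tends to stay cached in main memory, an equality lookup typically costs only two or three actual disk reads.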
\nt{B-Trees are dynamic; they grow by splitting nodes when they become too full and shrink by merging nodes when deletions leave them under-populated. This ensures that every block remains at least half-full, optimizing disk usage.}
\section{Hash Indices and Constant Time Lookups}
As an alternative to tree-structured indices, databases may use \textbf{Hash Indices}. These rely on a hash function, often explained with a ``smoothie machine'' analogy: a deterministic function that turns any input into a seemingly random integer.
\dfn{Hash Index}{A structure that uses a hash function to map search keys into specific buckets. Each bucket corresponds to one or more disk blocks holding pointers to the relevant records.}
\thm{Constant Complexity}{A hash index provides $O(1)$ lookup time for equality queries. No matter how large the table becomes, the time to locate a specific key remains constant, as it requires only the computation of the hash and a direct jump to the indicated bucket.}
\nt{The major limitation of hash indices is their lack of support for range queries. Because the hash function randomizes the placement of keys, two keys that are close in value (e.g., 19 and 20) will likely end up in completely different parts of the index.}
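A minimal in-memory sketch of the bucket mechanics, assuming hypothetical record identifiers in place of real block pointers and letting Python's built-in \texttt{hash} play the role of the hash function:
\begin{verbatim}
# Minimal in-memory sketch of a hash index (illustrative only).
# Keys are hashed into a fixed number of buckets; each bucket holds
# (key, record_id) pairs that stand in for pointers into disk blocks.

NUM_BUCKETS = 8


class HashIndex:
    def __init__(self, num_buckets=NUM_BUCKETS):
        self.buckets = [[] for _ in range(num_buckets)]

    def _bucket(self, key):
        # The deterministic "smoothie machine": nearby keys are scattered
        # across unrelated buckets.
        return self.buckets[hash(key) % len(self.buckets)]

    def insert(self, key, record_id):
        self._bucket(key).append((key, record_id))

    def lookup(self, key):
        # O(1) with respect to table size: one hash computation plus a
        # scan of a single (small) bucket.
        return [rid for k, rid in self._bucket(key) if k == key]


if __name__ == "__main__":
    idx = HashIndex()
    for age, rid in [(19, "r1"), (20, "r2"), (19, "r3"), (45, "r4")]:
        idx.insert(age, rid)
    print(idx.lookup(19))  # equality lookup -> ['r1', 'r3']
    # A range query such as 19 <= age <= 20 gains nothing: 19 and 20
    # typically land in different buckets, so all buckets must be scanned.
\end{verbatim}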
\section{Index Selection and Cost Modeling}
Creating an index is not a "free" performance boost. Every index added to a relation imposes costs that must be weighed against its benefits. The decision process involves analyzing a query workload and estimating the average disk I/O.
\dfn{Index Creation Costs}{The costs associated with indices include the initial CPU and I/O time to build the structure, the additional disk space required to store it, and the "write penalty"—the fact that every \texttt{INSERT}, \texttt{DELETE}, or \texttt{UPDATE} to the base table must also update every associated index.}
\thm{The Selection Formula}{If $p$ is the probability that a query uses a certain attribute and $1-p$ is the probability of an update, an index on that attribute is beneficial only if the time saved during the queries outweighs the extra time spent on updates. This can be expressed as a linear combination of costs based on the parameters $B(R)$ (blocks) and $T(R)$ (tuples).}
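One way to instantiate this comparison (a sketch under simplifying assumptions: a non-clustering index on attribute $a$, a full scan as the alternative plan, and roughly two extra I/Os per update to maintain the index) is the break-even condition
\[
  p \cdot \underbrace{\left( B(R) - \frac{T(R)}{V(R,a)} \right)}_{\text{I/O saved per query}}
  \;>\;
  (1 - p) \cdot \underbrace{c_{\text{maint}}}_{\text{extra I/O per update}},
  \qquad c_{\text{maint}} \approx 2,
\]
so the index pays off as soon as queries on $a$ are frequent enough, or the relation large enough, for the left-hand side to dominate.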
\nt{In practice, many systems use an ``automatic tuning advisor'' that applies a greedy algorithm to suggest the best set of indices for a specific historical workload.}
\section{Indices and Complex Queries}
Indices are particularly powerful when dealing with joins or multiple selection criteria.
\thm{The Index-Join Strategy}{In a join $R \bowtie S$, if $S$ has an index on the join attribute, the system can iterate through $R$ and for each tuple, use the index on $S$ to find matching records. This is significantly faster than a nested-loop join if $R$ is small and the index on $S$ is efficient.}
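Under simplifying assumptions (a non-clustering index on $S.b$, about $L$ index blocks read per probe, and the $B$, $T$, $V$ estimates of the cost model above), the comparison can be written as
\[
  \underbrace{B(R) + T(R)\cdot\Bigl(L + \frac{T(S)}{V(S,b)}\Bigr)}_{\text{index join}}
  \qquad\text{vs.}\qquad
  \underbrace{B(R) + T(R)\cdot B(S)}_{\text{tuple-based nested loop}},
\]
which is why the strategy shines when $R$ is small and the index on $S.b$ is selective.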
\nt{When multiple indices are available for a single query, a technique called ``pointer intersection'' can be used. The system retrieves lists of pointers from several indices, intersects them in main memory, and only then reads the data blocks for the tuples that satisfy all conditions.}
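A toy sketch of this idea, with made-up record identifiers standing in for the pointer lists returned by two hypothetical secondary indexes:
\begin{verbatim}
# Sketch of pointer intersection for a query with two conditions,
# e.g. salary = 50000 AND department = 'IT'; all record ids are made up.

# Record ids returned by two hypothetical secondary indexes:
rids_from_salary_index = {101, 205, 311, 412, 517}
rids_from_department_index = {205, 517, 998}

# Intersect the pointer lists in main memory first ...
matching_rids = rids_from_salary_index & rids_from_department_index

# ... then read only the data blocks of the surviving candidates.
for rid in sorted(matching_rids):
    print(f"fetch block containing record {rid}")
\end{verbatim}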
\section{Information Retrieval and Inverted Indices}
A specialized form of indexing used for documents is the \textbf{Inverted Index}. This is the technology that powers web search engines and large-scale document repositories.
\dfn{Inverted Index}{A mapping from words (keywords) to the list of documents in which those words appear. Often, these lists include metadata such as the position of the word in the document or whether it appeared in a title or anchor tag.}
\nt{To optimize inverted indices, systems often use ``stemming'' (reducing words to their root form) and ``stop words'' (ignoring common words like ``the'' or ``and'' that do not help distinguish documents).}
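A toy inverted index illustrating both optimizations; the documents, the stop-word list, and the deliberately crude stemmer are all invented for the example (real systems use proper stemmers such as Porter's):
\begin{verbatim}
# Toy inverted index with stop-word removal and a (deliberately crude)
# stemmer; documents and word lists are invented for the example.

from collections import defaultdict

STOP_WORDS = {"the", "and", "a", "of", "in", "are"}


def crude_stem(word):
    # Real systems use proper stemmers (e.g. Porter); this one only
    # strips common plural endings.
    if word.endswith("es") and len(word) > 4:
        return word[:-2]
    if word.endswith("s") and len(word) > 3:
        return word[:-1]
    return word


def tokenize(text):
    words = [w.strip(".,").lower() for w in text.split()]
    return [crude_stem(w) for w in words if w and w not in STOP_WORDS]


def build_inverted_index(docs):
    # word -> {doc_id -> positions}, i.e. the document list plus the
    # positional metadata mentioned above.
    index = defaultdict(lambda: defaultdict(list))
    for doc_id, text in docs.items():
        for pos, word in enumerate(tokenize(text)):
            index[word][doc_id].append(pos)
    return index


if __name__ == "__main__":
    docs = {
        "d1": "The indexes of the database speed up queries.",
        "d2": "Hash indexes and B-Trees are index structures.",
    }
    index = build_inverted_index(docs)
    print(dict(index["index"]))  # -> {'d1': [0], 'd2': [1, 3]}
\end{verbatim}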
\section{Strategic Balance in Database Design}
The successful implementation of a database requires a strategic balance between logical flexibility and physical efficiency. Virtual views provide the necessary abstraction to simplify application development and manage data security, while the careful selection of indexes and materialized views ensures that the system remains responsive as the volume of data grows.
By employing a formal cost model based on disk access times, designers can objectively evaluate the merits of different storage configurations. The goal is to prioritize the most frequent and critical operations, even if this imposes a penalty on less common tasks. An index on a primary key, for example, is almost always beneficial: it is queried frequently, and a lookup retrieves a unique record with only a short index traversal and a single data block read. In contrast, an index on a frequently updated non-key attribute requires a more nuanced analysis to ensure it does not become a performance bottleneck. This continuous process of tuning and optimization is a hallmark of modern relational database management, allowing these systems to handle massive datasets while providing the illusion of instantaneous access to information.
Ultimately, the choice of views and indexes defines the operational efficiency of the entire information system. A well-designed logical and physical schema acts as the foundation for scalable, high-performance applications, enabling efficient data exploration and robust transaction processing in even the most demanding environments.

\section{Summary of Design Principles}
The theory of views and indices suggests that database design is as much about managing the physical medium as it is about logical modeling.
\thm{The Table Universe and Consistency}{While views provide a filtered perspective of the data, they must remain consistent with the ``table universe'': the set of all possible valid states of the database. Constraints and indices must be applied such that they hold true across all these potential states, ensuring that neither hardware failure nor concurrent access can corrupt the logical integrity of the system.}
\nt{Ultimately, the goal of indices is to turn linear or quadratic problems into logarithmic or constant ones. By carefully selecting which attributes to index based on the $B, T,$ and $V$ parameters, a designer can create a system that remains responsive even under the weight of massive datasets.}
In summary, views provide the necessary abstraction to keep application code clean and secure, while indices provide the surgical precision required to extract data from high-volume storage. A database is essentially a large library; a view is a specific bookshelf curated for a student, while an index is the card catalog that allows a librarian to find one specific page in a million volumes without having to read every book in the building.