From 86e3b5352e01e246e96a2b1be5f4ec937ac6ae59 Mon Sep 17 00:00:00 2001 From: Tanmay Date: Mon, 29 Aug 2022 14:56:56 +0100 Subject: [PATCH] Adds notes and code for index and window functions. --- database/code/generate_data.py | 49 ++++ database/code/models.py | 40 +++ database/code/pwiz.py | 227 +++++++++++++++++ database/notes/08-window-fuctions-indexes.md | 242 +++++++++++++++++++ 4 files changed, 558 insertions(+) create mode 100644 database/code/generate_data.py create mode 100644 database/code/models.py create mode 100644 database/code/pwiz.py create mode 100644 database/notes/08-window-fuctions-indexes.md diff --git a/database/code/generate_data.py b/database/code/generate_data.py new file mode 100644 index 0000000..33e0a22 --- /dev/null +++ b/database/code/generate_data.py @@ -0,0 +1,49 @@ +import argparse +import random +from typing import List +from urllib import request, error +import json +from models import Students + +API_URL = "https://randomuser.me/api" + + +def generate_user(count: int): + url = API_URL + f"?results={count}" + users = json.loads(request.urlopen(url).read()) + return users['results'] + +def to_student(user: dict) -> Students: + student = Students() + student.first_name = user['name']['first'] + student.last_name = user['name']['last'] + student.email = user['email'] + # random choice from batches 1 - 6 + student.batch = random.randint(1, 6) + student.address = f"{user['location']['city']}, {user['location']['state']}, {user['location']['country']}" + student.birth_date = user['dob']['date'].split("T")[0] + return student + +def main(count: int, filename: str) -> None: + with open(filename, "w") as f: + users = generate_user(count) + print(f"Generated {len(users)} users") + + students: List[Students] = [to_student(user) for user in users] + Students.bulk_create(students, batch_size=100) + print(f"Saved {len(students)} students") + +if __name__ == "__main__": + + # Declare arguments + parser = argparse.ArgumentParser(description="Student generator") + parser.add_argument( + "-n", "--number", type=int, default=10, help="Number of students to generate" + ) + parser.add_argument( + "-f", "--file", type=str, default="students.txt", help="File to write to" + ) + args = parser.parse_args() + + students_count, file_name = args.number, args.file + main(students_count, file_name) diff --git a/database/code/models.py b/database/code/models.py new file mode 100644 index 0000000..89d2cc9 --- /dev/null +++ b/database/code/models.py @@ -0,0 +1,40 @@ +from peewee import * +database = MySQLDatabase('jedi_academy', **{'charset': 'utf8', 'sql_mode': 'PIPES_AS_CONCAT', 'use_unicode': True, 'user': 'root'}) + +class UnknownField(object): + def __init__(self, *_, **__): pass + +class BaseModel(Model): + class Meta: + database = database + +class Instructors(BaseModel): + email = CharField() + first_name = CharField() + last_name = CharField() + phone = CharField(null=True) + + class Meta: + table_name = 'instructors' + +class Batches(BaseModel): + description = CharField(null=True) + instructor = ForeignKeyField(column_name='instructor_id', field='id', model=Instructors, null=True) + name = CharField() + start_date = DateField() + + class Meta: + table_name = 'batches' + +class Students(BaseModel): + address = CharField(null=True) + batch = ForeignKeyField(column_name='batch_id', field='id', model=Batches, null=True) + birth_date = DateField(null=True) + email = CharField() + first_name = CharField() + iq = IntegerField(null=True) + last_name = CharField() + phone = CharField(null=True) + + class Meta: + table_name = 'students' diff --git a/database/code/pwiz.py b/database/code/pwiz.py new file mode 100644 index 0000000..af4b58c --- /dev/null +++ b/database/code/pwiz.py @@ -0,0 +1,227 @@ +#!/usr/bin/env python + +import datetime +import os +import sys +from getpass import getpass +from optparse import OptionParser + +from peewee import * +from peewee import print_ +from peewee import __version__ as peewee_version +from playhouse.cockroachdb import CockroachDatabase +from playhouse.reflection import * + + +HEADER = """from peewee import *%s +database = %s('%s'%s) +""" + +BASE_MODEL = """\ +class BaseModel(Model): + class Meta: + database = database +""" + +UNKNOWN_FIELD = """\ +class UnknownField(object): + def __init__(self, *_, **__): pass +""" + +DATABASE_ALIASES = { + CockroachDatabase: ['cockroach', 'cockroachdb', 'crdb'], + MySQLDatabase: ['mysql', 'mysqldb'], + PostgresqlDatabase: ['postgres', 'postgresql'], + SqliteDatabase: ['sqlite', 'sqlite3'], +} + +DATABASE_MAP = dict((value, key) + for key in DATABASE_ALIASES + for value in DATABASE_ALIASES[key]) + +def make_introspector(database_type, database_name, **kwargs): + if database_type not in DATABASE_MAP: + err('Unrecognized database, must be one of: %s' % + ', '.join(DATABASE_MAP.keys())) + sys.exit(1) + + schema = kwargs.pop('schema', None) + DatabaseClass = DATABASE_MAP[database_type] + db = DatabaseClass(database_name, **kwargs) + return Introspector.from_database(db, schema=schema) + +def print_models(introspector, tables=None, preserve_order=False, + include_views=False, ignore_unknown=False, snake_case=True): + database = introspector.introspect(table_names=tables, + include_views=include_views, + snake_case=snake_case) + + db_kwargs = introspector.get_database_kwargs() + header = HEADER % ( + introspector.get_additional_imports(), + introspector.get_database_class().__name__, + introspector.get_database_name(), + ', **%s' % repr(db_kwargs) if db_kwargs else '') + print_(header) + + if not ignore_unknown: + print_(UNKNOWN_FIELD) + + print_(BASE_MODEL) + + def _print_table(table, seen, accum=None): + accum = accum or [] + foreign_keys = database.foreign_keys[table] + for foreign_key in foreign_keys: + dest = foreign_key.dest_table + + # In the event the destination table has already been pushed + # for printing, then we have a reference cycle. + if dest in accum and table not in accum: + print_('# Possible reference cycle: %s' % dest) + + # If this is not a self-referential foreign key, and we have + # not already processed the destination table, do so now. + if dest not in seen and dest not in accum: + seen.add(dest) + if dest != table: + _print_table(dest, seen, accum + [table]) + + print_('class %s(BaseModel):' % database.model_names[table]) + columns = database.columns[table].items() + if not preserve_order: + columns = sorted(columns) + primary_keys = database.primary_keys[table] + for name, column in columns: + skip = all([ + name in primary_keys, + name == 'id', + len(primary_keys) == 1, + column.field_class in introspector.pk_classes]) + if skip: + continue + if column.primary_key and len(primary_keys) > 1: + # If we have a CompositeKey, then we do not want to explicitly + # mark the columns as being primary keys. + column.primary_key = False + + is_unknown = column.field_class is UnknownField + if is_unknown and ignore_unknown: + disp = '%s - %s' % (column.name, column.raw_column_type or '?') + print_(' # %s' % disp) + else: + print_(' %s' % column.get_field()) + + print_('') + print_(' class Meta:') + print_(' table_name = \'%s\'' % table) + multi_column_indexes = database.multi_column_indexes(table) + if multi_column_indexes: + print_(' indexes = (') + for fields, unique in sorted(multi_column_indexes): + print_(' ((%s), %s),' % ( + ', '.join("'%s'" % field for field in fields), + unique, + )) + print_(' )') + + if introspector.schema: + print_(' schema = \'%s\'' % introspector.schema) + if len(primary_keys) > 1: + pk_field_names = sorted([ + field.name for col, field in columns + if col in primary_keys]) + pk_list = ', '.join("'%s'" % pk for pk in pk_field_names) + print_(' primary_key = CompositeKey(%s)' % pk_list) + elif not primary_keys: + print_(' primary_key = False') + print_('') + + seen.add(table) + + seen = set() + for table in sorted(database.model_names.keys()): + if table not in seen: + if not tables or table in tables: + _print_table(table, seen) + +def print_header(cmd_line, introspector): + timestamp = datetime.datetime.now() + print_('# Code generated by:') + print_('# python -m pwiz %s' % cmd_line) + print_('# Date: %s' % timestamp.strftime('%B %d, %Y %I:%M%p')) + print_('# Database: %s' % introspector.get_database_name()) + print_('# Peewee version: %s' % peewee_version) + print_('') + + +def err(msg): + sys.stderr.write('\033[91m%s\033[0m\n' % msg) + sys.stderr.flush() + +def get_option_parser(): + parser = OptionParser(usage='usage: %prog [options] database_name') + ao = parser.add_option + ao('-H', '--host', dest='host') + ao('-p', '--port', dest='port', type='int') + ao('-u', '--user', dest='user') + ao('-P', '--password', dest='password', action='store_true') + engines = sorted(DATABASE_MAP) + ao('-e', '--engine', dest='engine', choices=engines, + help=('Database type, e.g. sqlite, mysql, postgresql or cockroachdb. ' + 'Default is "postgresql".')) + ao('-s', '--schema', dest='schema') + ao('-t', '--tables', dest='tables', + help=('Only generate the specified tables. Multiple table names should ' + 'be separated by commas.')) + ao('-v', '--views', dest='views', action='store_true', + help='Generate model classes for VIEWs in addition to tables.') + ao('-i', '--info', dest='info', action='store_true', + help=('Add database information and other metadata to top of the ' + 'generated file.')) + ao('-o', '--preserve-order', action='store_true', dest='preserve_order', + help='Model definition column ordering matches source table.') + ao('-I', '--ignore-unknown', action='store_true', dest='ignore_unknown', + help='Ignore fields whose type cannot be determined.') + ao('-L', '--legacy-naming', action='store_true', dest='legacy_naming', + help='Use legacy table- and column-name generation.') + return parser + +def get_connect_kwargs(options): + ops = ('host', 'port', 'user', 'schema') + kwargs = dict((o, getattr(options, o)) for o in ops if getattr(options, o)) + if options.password: + kwargs['password'] = getpass() + return kwargs + + +if __name__ == '__main__': + raw_argv = sys.argv + + parser = get_option_parser() + options, args = parser.parse_args() + + if len(args) < 1: + err('Missing required parameter "database"') + parser.print_help() + sys.exit(1) + + connect = get_connect_kwargs(options) + database = args[-1] + + tables = None + if options.tables: + tables = [table.strip() for table in options.tables.split(',') + if table.strip()] + + engine = options.engine + if engine is None: + engine = 'sqlite' if os.path.exists(database) else 'postgresql' + + introspector = make_introspector(engine, database, **connect) + if options.info: + cmd_line = ' '.join(raw_argv[1:]) + print_header(cmd_line, introspector) + + print_models(introspector, tables, options.preserve_order, options.views, + options.ignore_unknown, not options.legacy_naming) \ No newline at end of file diff --git a/database/notes/08-window-fuctions-indexes.md b/database/notes/08-window-fuctions-indexes.md new file mode 100644 index 0000000..5b18be1 --- /dev/null +++ b/database/notes/08-window-fuctions-indexes.md @@ -0,0 +1,242 @@ +# Window Functions and Indexes +## Agenda +- [Window Functions and Indexes](#window-functions-and-indexes) + - [Agenda](#agenda) + - [Window Functions](#window-functions) + - [Syntax](#syntax) + - [Rank function](#rank-function) + - [Row Number function](#row-number-function) + - [Other window functions](#other-window-functions) + - [Indexes](#indexes) + - [Types of scans](#types-of-scans) + - [Full table scans](#full-table-scans) + - [Full Index scan](#full-index-scan) + - [Index Range scan](#index-range-scan) + - [Index seek](#index-seek) + - [Some guidelines for optimizing MySQL queries](#some-guidelines-for-optimizing-mysql-queries) + +## Window Functions +MySQL has supported window functions since version 8.0. The window functions allow you to solve query problems in new, easier ways and with better performance. + +Window functions operate on a window frame, or a set of rows that are somehow related to the current row. They are similar to GROUP BY, because they compute aggregate values for a group of rows. However, unlike GROUP BY, they do not collapse rows; instead, they keep the details of individual rows. + +Let's see an example of a window function with our `Jedi Academy` database: + +```sql +CREATE TABLE `students` ( + `id` int NOT NULL AUTO_INCREMENT, + `first_name` varchar(255) NOT NULL, + `last_name` varchar(255) NOT NULL, + `email` varchar(255) NOT NULL, + `phone` varchar(255), + `birth_date` date, + `address` varchar(255), + `iq` int, + `batch_id` int, + PRIMARY KEY (`id`) +); +``` + +**How can I get the student names along with the number of students in their batch?** +We would have to use a subquery to get the number of students in each batch. + +```sql +SELECT + id, + first_name, + last_name, + batch_id, + (SELECT + COUNT(*) + FROM + students s2 + WHERE + s2.batch_id = s.batch_id) +FROM + students s; +``` + +Another way to do the same thing is to use a window function which partitions the data into batches and computes the number of students in each batch. + +```sql +SELECT + id, + first_name, + last_name, + batch_id, + COUNT(*) OVER (PARTITION BY batch_id) batch_students +FROM + students; +``` + +Like the aggregate functions with the GROUP BY clause, window functions also operate on a subset of rows but they do not reduce the number of rows returned by the query. +In this example, the SUM() function works as a window function that operates on a set of rows defined by the contents of the OVER clause. A set of rows to which the SUM() function applies is referred to as a window. + +### Syntax + +``` +window_function_name(expression) OVER ( + [partition_defintion] + [order_definition] + [frame_definition] +) +``` + +The partition_clause breaks up the rows into chunks or partitions. Two partitions are separated by a partition boundary. The window function is performed within partitions and re-initialized when crossing the partition boundary. You can specify one or more expressions in the PARTITION BY clause. Multiple expressions are separated by commas. + +The partition_clause syntax looks like the following: +``` +PARTITION BY [{,...}] +``` + +e.g. +``` +PARTITION BY batch_id +``` + +The `order_by_clause` has the following syntax: +``` +ORDER BY [ASC|DESC], [{,...}] +``` + +The ORDER BY clause specifies how the rows are ordered within a partition. It is possible to order data within a partition on multiple keys, each key is specified by an expression. Multiple expressions are also separated by commas. + +Similar to the PARTITION BY clause, the ORDER BY clause is also supported by all the window functions. However, it only makes sense to use the ORDER BY clause for order-sensitive window functions. + +### Rank function + +The RANK() function is used mainly to create reports. It computes the rank for each row in the result set in the order specified. + +The ranks are sequential numbers starting from 1. When there are ties (i.e., multiple rows with the same value in the column used to order), these rows are assigned the same rank. In this case, the rank of the next row will have skipped some numbers according to the quantity of the tied rows. For this reason, the values returned by RANK() are not necessarily consecutive numbers. + +**How can we get the students ranked by their IQs?** + +```sql +SELECT + id, + first_name, + last_name, + batch_id, + iq, + RANK() OVER(ORDER BY iq DESC) AS rank_number +FROM + students s; +``` + +After the RANK(), we have an OVER() clause with an ORDER BY. The ORDER BY is mandatory for ranking functions. Here, the rows are sorted in ascending order according to the column ranking_score. The order is ascending by default; you may use ASC at the end of the ORDER BY clause to clarify the ascending order, but it is not necessary. + +**How can we get the students ranked by their IQs in their batch?** +``` +SELECT + id, + first_name, + last_name, + batch_id, + iq, + RANK() OVER(PARTITION BY batch_id ORDER BY iq DESC) AS rank_number +FROM + students s; +``` + +### Row Number function +Another popular ranking function used in databases is ROW_NUMBER(). It simply assigns consecutive numbers to each row in a specified order. + +**How can we get the students ranked by their IQs using the row number function?** + +```sql +SELECT + id, + first_name, + last_name, + iq, + ROW_NUMBER() OVER(ORDER BY iq DESC) as rank_score +FROM + students s; +``` +The query first orders the rows by iq in descending order. It then assigns row numbers consecutively starting with 1. The rows with ties in ranking_score are assigned different row numbers, effectively ignoring the ties. + +### Other window functions +* [CUME_DIST](https://dev.mysql.com/doc/refman/8.0/en/window-function-descriptions.html#function_cume-dist) +* [DENSE_RANK](https://dev.mysql.com/doc/refman/8.0/en/window-function-descriptions.html#function_dense-rank) +* [FIRST_VALUE](https://dev.mysql.com/doc/refman/8.0/en/window-function-descriptions.html#function_first-value) +* [LAST_VALUE](https://dev.mysql.com/doc/refman/8.0/en/window-function-descriptions.html#function_last-value) +* [LEAD](https://dev.mysql.com/doc/refman/8.0/en/window-function-descriptions.html#function_lead) +* [LAG](https://dev.mysql.com/doc/refman/8.0/en/window-function-descriptions.html#function_lag) + +## Indexes + +Indexes are used to find rows with specific column values quickly. Without an index, MySQL must begin with the first row and then read through the entire table to find the relevant rows. The larger the table, the more this costs. If the table has an index for the columns in question, MySQL can quickly determine the position to seek to in the middle of the data file without having to look at all the data. This is much faster than reading every row sequentially. + +Most MySQL indexes (PRIMARY KEY, UNIQUE, INDEX, and FULLTEXT) are stored in B-trees. +MySQL uses indexes for these operations: +* To find the rows matching a WHERE clause quickly. +* To eliminate rows from consideration. If there is a choice between multiple indexes, MySQL normally uses the index that finds the smallest number of rows +* To retrieve rows from other tables when performing joins. MySQL can use indexes on columns more efficiently if they are declared as the same type and size. + +Normally, you create all indexes on a table at the time the table itself is created with CREATE TABLE. + +```sql +CREATE TABLE `students` ( + `id` int NOT NULL AUTO_INCREMENT, + `first_name` varchar(255) NOT NULL, + `last_name` varchar(255) NOT NULL, + `email` varchar(255) NOT NULL, + `phone` varchar(255), + `birth_date` date, + `address` varchar(255), + `iq` int, + `batch_id` int, + PRIMARY KEY (`id`), + INDEX(phone) +); +``` + +Or you can create an index and then add it to the table. +The syntax for creating an index is: +``` +CREATE INDEX [some_index] ON [some_table] ([some_column],[other_column]); +``` + +```sql +CREATE INDEX phone on students(phone) +``` + +You can also create a composite index using the same syntax. + +```sql +CREATE INDEX phone_email on students(phone, email); +``` + +### Types of scans +#### Full table scans +A full table scan (also known as a sequential scan) is a scan made on a database where each row of the table is read in a sequential (serial) order and the columns encountered are checked for the validity of a condition. Full table scans are usually the slowest method of scanning a table due to the heavy amount of I/O reads required from the disk which consists of multiple seeks as well as costly disk to memory transfers. + +#### Full Index scan + +If your table has a clustered index and you are firing a query that needs all or most of the rows i.e. query without WHERE or HAVING clause, then it uses an index scan. It works similar to the table scan, during the query optimization process, the query optimizer takes a look at the available index and chooses the best one, based on information provided in your joins and where clause, along with the statistical information database keeps. + +The main difference between a full table scan and an index scan is that because data is sorted in the index tree, the query engine knows when it has reached the end of the current it is looking for. It can then send the query, or move on to the next range of data as necessary + +#### Index Range scan +Index range scan is a common operation for accessing selective data. It can be bounded (bounded on both sides) or unbounded (on one or both sides). Data is returned in the ascending order of index columns. Multiple rows with identical values are sorted. + +#### Index seek +When your search criterion matches an index well enough that the index can navigate directly to a particular point in your data, that's called an index seek. It is the fastest way to retrieve data in a database. The index seeks are also a great sign that your indexes are being properly used. + +This happens when you specify a condition in WHERE clause like searching an employee by id or name if you have a respective index. + +## Some guidelines for optimizing MySQL queries +* Avoid using functions in predicates + ```sql + SELECT * FROM students where upper(phone) = '123'; + ``` + Because of the UPPER() function, the database doesn’t utilize the index on COL1. If there isn’t any way to avoid that function in SQL, you will have to create a new function-based index or have to generate custom columns in the database to improve performance. +* Avoid using a wildcard (%) at the beginning of a predicate + ```sql + SELECT * FROM students where phone like '%123'; + ``` + The wildcard causes a full table scan. +* Avoid unnecessary columns in SELECT clause + Instead of using ‘SELECT *’, always specify columns in the SELECT clause to improve MySQL performance. Because unnecessary columns cause additional load on the database, slowing down its performance as well whole systematic process. +* Pagination +