import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

Notes#

Lecture notes for BB1000 (20034) vt25

Basics#

Numeric data types#

8 * 9 # expression: values and operators
72
8
8
type(8 * 9)
int
type(8 / 9)
float
8 / 9 
0.8888888888888888
8 // 9 # floor division
0
11 // 9
1
11 % 9  # modulo, remainder
2

Complex number

\[ z = x + i*y \]

x: real part, y: imaginary part

\[ i^2 = -1 \]
type(1j)
complex
1j * 1j
(-1+0j)
1j ** 2
(-1+0j)
True
True
False
False
type(True)
bool
True or False
True
True and False
False
123 % 2 == 0 # True for even numbers
False
124 % 2 == 0 # True for even numbers
True
1 + 2 == 3
True
0.1 + 0.2 == 0.3
False
0.1 + 0.2
0.30000000000000004

Strings#

'Hello world'
'Hello world'
type('Hello world')
str
"Hello world"
'Hello world'
'Hello' == "Hello"
True
'It's time to go'
  Cell In[27], line 1
    'It's time to go'
                    ^
SyntaxError: unterminated string literal (detected at line 1)
'It\'s time to go'  # here \' means the literal apostrophe
"It's time to go"
"It's time to go"
"It's time to go"
"It's time\nto go"  # \n refers to newline character
"It's time\nto go"
print("It's time\nto go")
It's time
to go
"It's time
to go"
  Cell In[32], line 1
    "It's time
    ^
SyntaxError: unterminated string literal (detected at line 1)
"""It's time
to go"""
"It's time\nto go"
# string operators
"Hello" + " world"
'Hello world'
"Hello" * " world"
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
Cell In[35], line 1
----> 1 "Hello" * " world"

TypeError: can't multiply sequence by non-int of type 'str'
"Hello" * 3
'HelloHelloHello'
print("Hello world" + "\n" +  len("Hello world")*'_')
Hello world
___________
# datatypes are also functions that convert values from one type to another
type(int('77'))
int
# ask year of birth and report age
print("What year were you born?")
year_of_birth = input()
print(year_of_birth)
age = 2025 - int(year_of_birth)
print("This year you will be " + str(age) + " years old")
print("This year you will be", age, "years old")
print(f"This year you will be {age} years old")
What year were you born?
1961
This year you will be 64 years old
This year you will be 64 years old
This year you will be 64 years old

Lists#

colours = ['hearts', 'spades', 'diamonds', 'clubs']
values = [2, 3, 4, 5, 6, 7, 8, 9, 10, 'knight', 'queen', 'king', 'ace']
dir(colours)
['__add__',
 '__class__',
 '__class_getitem__',
 '__contains__',
 '__delattr__',
 '__delitem__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__iadd__',
 '__imul__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__mul__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__reversed__',
 '__rmul__',
 '__setattr__',
 '__setitem__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 'append',
 'clear',
 'copy',
 'count',
 'extend',
 'index',
 'insert',
 'pop',
 'remove',
 'reverse',
 'sort']
colours + colours
['hearts',
 'spades',
 'diamonds',
 'clubs',
 'hearts',
 'spades',
 'diamonds',
 'clubs']
len(colours)
4
len(values)
13
colours[0] # the first element
'hearts'
colours[3] 
'clubs'
colours[-1]
'clubs'
values.append('joker')
values
[2, 3, 4, 5, 6, 7, 8, 9, 10, 'knight', 'queen', 'king', 'ace', 'joker']
values.pop?
Signature: values.pop(index=-1, /)
Docstring:
Remove and return item at index (default last).

Raises IndexError if list is empty or index is out of range.
Type:      builtin_function_or_method
values.pop()
'joker'
values
[2, 3, 4, 5, 6, 7, 8, 9, 10, 'knight', 'queen', 'king', 'ace']
colours
['hearts', 'spades', 'diamonds', 'clubs']
colours.sort()
colours
['clubs', 'diamonds', 'hearts', 'spades']
colours.reverse()
colours
['spades', 'hearts', 'diamonds', 'clubs']

Tuples#

card1 = (colours[0], values[0])
card1
('spades', 2)
dir(card1)
['__add__',
 '__class__',
 '__class_getitem__',
 '__contains__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__getnewargs__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__mul__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__rmul__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 'count',
 'index']
card1.index(2)
1

Slicing#

values
[2, 3, 4, 5, 6, 7, 8, 9, 10, 'knight', 'queen', 'king', 'ace']
# list[start: end: step]
values[1: 5]
[3, 4, 5, 6]
values[0: 13: 2] # every second element
[2, 4, 6, 8, 10, 'queen', 'ace']
values[::2]
[2, 4, 6, 8, 10, 'queen', 'ace']
# the first 9 elements
values[:9]
[2, 3, 4, 5, 6, 7, 8, 9, 10]
# the last 4 elements
values[-4:]
['knight', 'queen', 'king', 'ace']
# compare
"Hello world"[-4:]
'orld'

Dictionaries#

d = {'a': 1, 'b': 2}
len(d)
2
d.keys()
dict_keys(['a', 'b'])
d.values()
dict_values([1, 2])
d['a']
1
d['b']
2
{'a': 1, 'b': 2} == {'b': 2, 'a': 1}
True
# Card game
game = {
    'player1': [],
    'player2': []
}

deck = [ ('hearts', 2), ('clubs', 4), ('diamonds', 10), ('spades', 5) ]
game['player1'].append(deck.pop())
print(game)
print(deck)
game['player2'].append(deck.pop())
print(game)
print(deck)
game['player1'].append(deck.pop())
print(game)
print(deck)
game['player2'].append(deck.pop())
print(game)
print(deck)
{'player1': [('spades', 5)], 'player2': []}
[('hearts', 2), ('clubs', 4), ('diamonds', 10)]
{'player1': [('spades', 5)], 'player2': [('diamonds', 10)]}
[('hearts', 2), ('clubs', 4)]
{'player1': [('spades', 5), ('clubs', 4)], 'player2': [('diamonds', 10)]}
[('hearts', 2)]
{'player1': [('spades', 5), ('clubs', 4)], 'player2': [('diamonds', 10), ('hearts', 2)]}
[]
game
{'player1': [('spades', 5), ('clubs', 4)],
 'player2': [('diamonds', 10), ('hearts', 2)]}

Repetition#

for-loops#

colours
['spades', 'hearts', 'diamonds', 'clubs']
deck = []
for colour in colours:
    print(colour)
    for value in values:
        print(value, end=' ')
        card = (colour, value)
        deck.append(card)
    print()
spades
2 3 4 5 6 7 8 9 10 knight queen king ace 
hearts
2 3 4 5 6 7 8 9 10 knight queen king ace 
diamonds
2 3 4 5 6 7 8 9 10 knight queen king ace 
clubs
2 3 4 5 6 7 8 9 10 knight queen king ace 
len(deck)
52
#for x in game.keys():
for player in game:
    print(player)
    print(f"Player {player} has hand {game[player]}")
player1
Player player1 has hand [('spades', 5), ('clubs', 4)]
player2
Player player2 has hand [('diamonds', 10), ('hearts', 2)]
for x in game.items():
    print(x)
('player1', [('spades', 5), ('clubs', 4)])
('player2', [('diamonds', 10), ('hearts', 2)])
for x in game.items():
    player = x[0]
    hand = x[1]
    print(f"Player {player} has hand {hand}")
Player player1 has hand [('spades', 5), ('clubs', 4)]
Player player2 has hand [('diamonds', 10), ('hearts', 2)]
for x in game.items():
    player, hand = x
    print(f"Player {player} has hand {hand}")
Player player1 has hand [('spades', 5), ('clubs', 4)]
Player player2 has hand [('diamonds', 10), ('hearts', 2)]
for player, hand in game.items():
    print(f"Player {player} has hand {hand}")
Player player1 has hand [('spades', 5), ('clubs', 4)]
Player player2 has hand [('diamonds', 10), ('hearts', 2)]

Branching#

if statements#

# game where largest sum of value wins
if False:
    print('yes')
else:
    print('no')
no
# sum up card values for player 1
value1 = 0
for card in game['player1']:
    value1 = value1 + card[1]
print(value1)
9
# sum up card values for player 2
value2 = 0
for card in game['player2']:
    value2 = value2 + card[1]
print(value2)
12
# The player with the largest value wins; equal values are a draw.
# Without the explicit draw branch a tie would be reported as a win for player 2.
if value1 > value2:
    print("Player1 wins")
elif value2 > value1:
    print("Player2 wins")
else:
    print("It's a draw")
Player2 wins

Functions#

print
<function print(*args, sep=' ', end='\n', file=None, flush=False)>
print("Hello")
Hello
len
<function len(obj, /)>
len('abc')
3
len([1,2,])
2
len({'a': 1})
1
input
<bound method Kernel.raw_input of <ipykernel.ipkernel.IPythonKernel object at 0x71a69fe93620>>
input()
'hoho'
#dir()
dir(__builtins__)
['ArithmeticError',
 'AssertionError',
 'AttributeError',
 'BaseException',
 'BaseExceptionGroup',
 'BlockingIOError',
 'BrokenPipeError',
 'BufferError',
 'BytesWarning',
 'ChildProcessError',
 'ConnectionAbortedError',
 'ConnectionError',
 'ConnectionRefusedError',
 'ConnectionResetError',
 'DeprecationWarning',
 'EOFError',
 'Ellipsis',
 'EncodingWarning',
 'EnvironmentError',
 'Exception',
 'ExceptionGroup',
 'False',
 'FileExistsError',
 'FileNotFoundError',
 'FloatingPointError',
 'FutureWarning',
 'GeneratorExit',
 'IOError',
 'ImportError',
 'ImportWarning',
 'IndentationError',
 'IndexError',
 'InterruptedError',
 'IsADirectoryError',
 'KeyError',
 'KeyboardInterrupt',
 'LookupError',
 'MemoryError',
 'ModuleNotFoundError',
 'NameError',
 'None',
 'NotADirectoryError',
 'NotImplemented',
 'NotImplementedError',
 'OSError',
 'OverflowError',
 'PendingDeprecationWarning',
 'PermissionError',
 'ProcessLookupError',
 'PythonFinalizationError',
 'RecursionError',
 'ReferenceError',
 'ResourceWarning',
 'RuntimeError',
 'RuntimeWarning',
 'StopAsyncIteration',
 'StopIteration',
 'SyntaxError',
 'SyntaxWarning',
 'SystemError',
 'SystemExit',
 'TabError',
 'TimeoutError',
 'True',
 'TypeError',
 'UnboundLocalError',
 'UnicodeDecodeError',
 'UnicodeEncodeError',
 'UnicodeError',
 'UnicodeTranslateError',
 'UnicodeWarning',
 'UserWarning',
 'ValueError',
 'Warning',
 'ZeroDivisionError',
 '_IncompleteInputError',
 '__IPYTHON__',
 '__build_class__',
 '__debug__',
 '__doc__',
 '__import__',
 '__loader__',
 '__name__',
 '__package__',
 '__spec__',
 'abs',
 'aiter',
 'all',
 'anext',
 'any',
 'ascii',
 'bin',
 'bool',
 'breakpoint',
 'bytearray',
 'bytes',
 'callable',
 'chr',
 'classmethod',
 'compile',
 'complex',
 'copyright',
 'credits',
 'delattr',
 'dict',
 'dir',
 'display',
 'divmod',
 'enumerate',
 'eval',
 'exec',
 'execfile',
 'filter',
 'float',
 'format',
 'frozenset',
 'get_ipython',
 'getattr',
 'globals',
 'hasattr',
 'hash',
 'help',
 'hex',
 'id',
 'input',
 'int',
 'isinstance',
 'issubclass',
 'iter',
 'len',
 'license',
 'list',
 'locals',
 'map',
 'max',
 'memoryview',
 'min',
 'next',
 'object',
 'oct',
 'open',
 'ord',
 'pow',
 'print',
 'property',
 'range',
 'repr',
 'reversed',
 'round',
 'runfile',
 'set',
 'setattr',
 'slice',
 'sorted',
 'staticmethod',
 'str',
 'sum',
 'super',
 'tuple',
 'type',
 'vars',
 'zip']
# define our own function, make this a function
"""
value1 = 0
for card in game['player1']:
    value1 = value1 + card[1]
print(value1)
"""
"\nvalue1 = 0\nfor card in game['player1']:\n    value1 = value1 + card[1]\nprint(value1)\n"

A function definition

  • starts with the def keyword, a name, parentheses, colon

  • function body (indented)

  • values are passed to the function as arguments

  • parameters of a function are variables that hold these values

  • variables defined inside the function are not valid outside the function

The lines in a function body are executed when the function is called

  • the value of a function call is defined by a return statement in the function body

  • the None object represents the absence of a value; it is returned when the function body has no return statement

global scope: variables not defined in any function; local scope: variables defined inside a function (only valid locally)

a function can use values from the global scope

#define the function
def calculate_hand_value(card_game, player):
    """
    Add up the second item of every card in card_game[player],
    print the total and return it.
    """
    hand = card_game[player]  # the list of cards held by this player
    value = 0
    for card in hand:
        value += card[1]  # card[1] is the value part of the card
    print(value)
    return value
# call function
calculate_hand_value(game, 'player1') #value of game is passed to function parameter card_game, 'player1' is passed to parameter player
calculate_hand_value(game, 'player2')
9
12
12
result = calculate_hand_value(game, 'player1')
result == 9
9
True
print(result)
9
print(value)
ace
# alternative with global game
def calculate_hand_value2(player):
    """
    Add up the second item of every card in game[player], where game
    is looked up in the global scope; print the total and return it.
    """
    value = sum(card[1] for card in game[player])
    print(value)
    return value
calculate_hand_value2('player1')
9
9
print?
Signature: print(*args, sep=' ', end='\n', file=None, flush=False)
Docstring:
Prints the values to a stream, or to sys.stdout by default.

sep
  string inserted between values, default a space.
end
  string appended after the last value, default a newline.
file
  a file-like object (stream); defaults to the current sys.stdout.
flush
  whether to forcibly flush the stream.
Type:      builtin_function_or_method
print('Hello', 'world', end='')  # args parameter will be the tuple ('Hello', 'world')
print('Hello', 'world', sep='-')
Hello worldHello-world
# previous example with keyword
# alternative with global game as an optional argument
def calculate_hand_value3(player, card_game=game):
    """
    Calculate the value of the hand of player
    """
    # The default for card_game is whatever object the global name game
    # refers to at the moment this def statement is executed.
    _value = sum(_card[1] for _card in card_game[player])
    return _value
calculate_hand_value3?
Signature:
calculate_hand_value3(
    player,
    card_game={'player1': [('spades', 5), ('clubs', 4)], 'player2': [('diamonds', 10), ('hearts', 2)]},
)
Docstring: Calculate the value of the hand of player
File:      /tmp/ipykernel_1077854/3892734418.py
Type:      function
help(calculate_hand_value3)
Help on function calculate_hand_value3 in module __main__:

calculate_hand_value3(
    player,
    card_game={'player1': [('spades', 5), ('clubs', 4)], 'player2': [('diamonds', 10), ('hearts', 2)]}
)
    Calculate the value of the hand of player
calculate_hand_value3('player1')
9
_value # only defined locally inside the function, not globally
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[120], line 1
----> 1 _value # only defined locally inside the function, not globally

NameError: name '_value' is not defined

Modules#

import hello
The value of __name__ is hello
dir(hello)
['__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__spec__',
 'hello_world',
 'sys']
hello.hello_world()
Hello -f
Goodbye
  • importing a Python file runs all the code in it

  • definitions in the file are saved in a so-called namespace

# %load hello.py
import sys

def hello_world():
    """
    Print 'Hello' followed by the first command-line argument when one
    was given, otherwise just 'Hello'; then print 'Goodbye'.
    """
    # sys.argv[0] is the program name, so an argument exists only when
    # the list has more than one entry.
    has_argument = len(sys.argv) > 1
    if not has_argument:
        print("Hello")
    else:
        print("Hello", sys.argv[1])
    print("Goodbye")


if __name__ == "__main__": #do not run this code during import
     hello_world()
    
print("The value of __name__ is", __name__)
#import os
#dir(os)
import math
dir(math)
['__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__spec__',
 'acos',
 'acosh',
 'asin',
 'asinh',
 'atan',
 'atan2',
 'atanh',
 'cbrt',
 'ceil',
 'comb',
 'copysign',
 'cos',
 'cosh',
 'degrees',
 'dist',
 'e',
 'erf',
 'erfc',
 'exp',
 'exp2',
 'expm1',
 'fabs',
 'factorial',
 'floor',
 'fma',
 'fmod',
 'frexp',
 'fsum',
 'gamma',
 'gcd',
 'hypot',
 'inf',
 'isclose',
 'isfinite',
 'isinf',
 'isnan',
 'isqrt',
 'lcm',
 'ldexp',
 'lgamma',
 'log',
 'log10',
 'log1p',
 'log2',
 'modf',
 'nan',
 'nextafter',
 'perm',
 'pi',
 'pow',
 'prod',
 'radians',
 'remainder',
 'sin',
 'sinh',
 'sqrt',
 'sumprod',
 'tan',
 'tanh',
 'tau',
 'trunc',
 'ulp']
math.pi
3.141592653589793
math.sin(math.pi)
1.2246467991473532e-16
math.sin(math.pi/2)
1.0
#optional import
from math import pi, sin, cos
sin(pi/4)
0.7071067811865475
cos(pi/4)
0.7071067811865476

Files#

f = open('hello.txt', mode='r') # open for reading an existing file
#dir(f)
f.read()
'Hello\nGoodbye\n'
f.read()
''
f = open('hello.txt', mode='r') # open for reading an existing file
f.readable()
f.readline() # read one line at a time
'Hello\n'
f.readline()
'Goodbye\n'
f.readline()
''
f = open('hello.txt', mode='r') # open for reading an existing file

f.readlines() # read the file into a list of lines
['Hello\n', 'Goodbye\n']
f = open('hello.txt', mode='r') # open for reading an existing file
for line in f:
    #print(line.upper(), end='')
    print(line.strip().upper())
HELLO
GOODBYE
new = open('newhello.txt', mode='w')
new.write("Hello")
new.write("World!")
6
!ls -l
total 3600
-rw-rw-r-- 1 bb1000-vt25 bb1000-vt25   15308 maj 16 14:23 advanced-2025-05-16.ipynb
-rw-rw-r-- 1 bb1000-vt25 bb1000-vt25  256100 apr 15 08:59 child_mortality_0_5_year_olds_dying_per_1000_born.csv
-rw-r--r-- 1 bb1000-vt25 bb1000-vt25   21434 apr 25 14:03 classes-2025-04-25.ipynb
-rw-r--r-- 1 bb1000-vt25 bb1000-vt25    5000 apr  8 15:58 data.txt
-rw-rw-r-- 1 bb1000-vt25 bb1000-vt25    1816 mar 31 16:00 Demo.ipynb
-rw-rw-r-- 1 bb1000-vt25 bb1000-vt25  298018 apr 15 08:58 gdp_pcap.csv
drwxrwxr-x 4 bb1000-vt25 bb1000-vt25    4096 maj  9 14:40 git-2025-05-09
drwxrwxr-x 6 bb1000-vt25 bb1000-vt25    4096 maj 19 16:26 git_demo
-rw-rw-r-- 1 bb1000-vt25 bb1000-vt25     280 apr  1 08:49 hello.py
-rw-rw-r-- 1 bb1000-vt25 bb1000-vt25      14 mar 31 16:46 hello.txt
drwxrwxr-x 3 bb1000-vt25 bb1000-vt25    4096 apr 28 16:43 htmlcov
-rw-rw-r-- 1 bb1000-vt25 bb1000-vt25     241 apr 28 16:52 leap.py
-rw-rw-r-- 1 bb1000-vt25 bb1000-vt25     149 mar 31 16:30 mult_table.py
-rw-rw-r-- 1 bb1000-vt25 bb1000-vt25      35 apr 29 09:32 my_math.py
-rw-rw-r-- 1 bb1000-vt25 bb1000-vt25       0 maj 20 10:21 newhello.txt
-rw-rw-r-- 1 bb1000-vt25 bb1000-vt25 1332300 maj 20 10:20 Notes.ipynb
-rw-rw-r-- 1 bb1000-vt25 bb1000-vt25    2750 apr 14 16:42 numbers.txt
-rw-r--r-- 1 bb1000-vt25 bb1000-vt25   41855 apr 11 13:55 numpy-2025-04-11.ipynb
-rw-rw-r-- 1 bb1000-vt25 bb1000-vt25   21699 apr 11 13:24 numpy-2025-04-11.zip
-rw-rw-r-- 1 bb1000-vt25 bb1000-vt25      18 apr 14 16:21 omdb.py
-rw-r--r-- 1 bb1000-vt25 bb1000-vt25  425009 apr 14 16:34 pandas-2025-04-11.ipynb
-rw-rw-r-- 1 bb1000-vt25 bb1000-vt25  306031 apr 14 15:20 pandas-2025-04-11.zip
-rw-rw-r-- 1 bb1000-vt25 bb1000-vt25     116 apr  8 09:31 plotdemo.py
-rw-rw-r-- 1 bb1000-vt25 bb1000-vt25  335155 apr 15 09:01 pop.csv
drwxrwxr-x 2 bb1000-vt25 bb1000-vt25    4096 apr 29 10:42 __pycache__
-rw-rw-r-- 1 bb1000-vt25 bb1000-vt25  493778 apr 15 10:21 rosling.png
-rw-rw-r-- 1 bb1000-vt25 bb1000-vt25   18244 apr 29 10:53 testing-2025-04-29.ipynb
-rw-rw-r-- 1 bb1000-vt25 bb1000-vt25     323 apr 28 16:34 test_leap.py
-rw-rw-r-- 1 bb1000-vt25 bb1000-vt25     169 apr 29 09:32 test_my_math.py
-rw-rw-r-- 1 bb1000-vt25 bb1000-vt25     323 apr 29 10:45 test_timestamps.py
-rw-rw-r-- 1 bb1000-vt25 bb1000-vt25     316 apr 29 10:26 timestamps1.py
-rw-rw-r-- 1 bb1000-vt25 bb1000-vt25     292 apr 29 10:38 timestamps2.py
-rw-rw-r-- 1 bb1000-vt25 bb1000-vt25     640 apr 29 09:55 timestamps.py
-rw-r--r-- 1 bb1000-vt25 bb1000-vt25    6742 apr 14 14:49 TSLA.csv
-rw-rw-r-- 1 bb1000-vt25 bb1000-vt25     113 apr 29 09:37 untitled.md
new.close()
!ls -l newhello.txt
-rw-rw-r-- 1 bb1000-vt25 bb1000-vt25 11 maj 20 10:21 newhello.txt
!cat newhello.txt
HelloWorld!
# use write method to get separate lines
with open('newhello.txt', 'w') as new: 
    new.write("Hello\n")
    new.write("World!\n")
!cat newhello.txt
Hello
World!
# add text to end of a file
with open("newhello.txt", 'a') as f:
    f.write("Goodbye")
!cat newhello.txt
Hello
World!
Goodbye

pathlib module#

import pathlib
ls /home/bb1000-vt25/Downloads/MOCK_DATA.csv
/home/bb1000-vt25/Downloads/MOCK_DATA.csv
# in windows   C:\HOME\BB1000\DOWNLOADS\MOCK_DATA.csv
datafile = pathlib.Path.home() / 'Downloads' / 'MOCK_DATA.csv'
datafile
PosixPath('/home/bb1000-vt25/Downloads/MOCK_DATA.csv')
open(datafile).readline()
'id,first_name,last_name,email,gender,ip_address,employee_id,salary,department\n'
#compare mean salaries for different genders
all_lines = open(datafile).readlines()
gender_data = {} # {'gender': [salaries..]}
for line in all_lines[1:]:
    fields = line.split(',') # fields are values of single row/line
    gender = fields[4]
    salary = int(fields[7])
    if gender in gender_data: # is gender present as a key in the dictionary gender_data?
        gender_data[gender].append(salary)
    else:
        gender_data[gender] = [salary]
    
gender_data.keys()
dict_keys(['Male', 'Genderfluid', 'Female', 'Genderqueer', 'Non-binary', 'Bigender', 'Polygender', 'Agender'])
for gender, salaries in gender_data.items():
    mean = round(sum(salaries)/len(salaries))
    print(gender, mean, len(salaries))
Male 84138 444
Genderfluid 95577 17
Female 86417 448
Genderqueer 86913 24
Non-binary 86858 17
Bigender 87399 20
Polygender 77018 18
Agender 96232 12

csv module#

import csv
#dir(csv)
for line in csv.reader(open(datafile)):
    print(line)
    break
['id', 'first_name', 'last_name', 'email', 'gender', 'ip_address', 'employee_id', 'salary', 'department']
csv.reader?
Docstring:
csv_reader = reader(iterable [, dialect='excel']
                        [optional keyword args])
    for row in csv_reader:
        process(row)

The "iterable" argument can be any object that returns a line
of input for each iteration, such as a file object or a list.  The
optional "dialect" parameter is discussed below.  The function
also accepts optional keyword arguments which override settings
provided by the dialect.

The returned object is an iterator.  Each iteration returns a row
of the CSV file (which can span multiple input lines).
Type:      builtin_function_or_method
all_lines = list(csv.reader(open(datafile)))
all_lines[1:5]
[['1',
  'Lucius',
  'Feehery',
  'lfeehery0@wiley.com',
  'Male',
  '155.143.110.178',
  '1',
  '123759',
  'Legal'],
 ['2',
  'Melina',
  'Jossel',
  'mjossel1@slideshare.net',
  'Genderfluid',
  '238.182.167.147',
  '2',
  '91045',
  'Support'],
 ['3',
  'Davide',
  'Allin',
  'dallin2@com.com',
  'Male',
  '151.26.69.126',
  '3',
  '66863',
  'Accounting'],
 ['4',
  'Angeline',
  'Got',
  'agot3@amazon.com',
  'Female',
  '225.181.149.174',
  '4',
  '45217',
  'Human Resources']]
csv.DictReader?
Init signature:
csv.DictReader(
    f,
    fieldnames=None,
    restkey=None,
    restval=None,
    dialect='excel',
    *args,
    **kwds,
)
Docstring:      <no docstring>
File:           ~/miniconda3/envs/bb1000/lib/python3.13/csv.py
Type:           type
Subclasses:     
for line in c: # line will be a dictionary with header values as keys
    print(line)
    break
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[167], line 1
----> 1 for line in c: # line will be a dictionary with header values as keys
      2     print(line)
      3     break

NameError: name 'c' is not defined
gender_data = {} # {'gender': [salaries..]}
for line in csv.DictReader(open(datafile)):
    salary = int(line['salary'])
    gender = line['gender']
    if gender in gender_data: # is gender present as a key in the dictionary gender_data?
        gender_data[gender].append(salary)
    else:
        gender_data[gender] = [salary]
for gender, salaries in gender_data.items():
    mean = round(sum(salaries)/len(salaries))
    print(gender, mean, len(salaries))
Male 84138 444
Genderfluid 95577 17
Female 86417 448
Genderqueer 86913 24
Non-binary 86858 17
Bigender 87399 20
Polygender 77018 18
Agender 96232 12
# with defaultdict from the collections module

import collections
collections.defaultdict?
Init signature: collections.defaultdict(self, /, *args, **kwargs)
Docstring:     
defaultdict(default_factory=None, /, [...]) --> dict with default factory

The default factory is called without arguments to produce
a new value when a key is not present, in __getitem__ only.
A defaultdict compares equal to a dict with the same items.
All remaining arguments are treated the same as if they were
passed to the dict constructor, including keyword arguments.
File:           ~/miniconda3/envs/bb1000/lib/python3.13/collections/__init__.py
Type:           type
Subclasses:     FreezableDefaultDict
counter = collections.defaultdict(int)
int()
0
counter
defaultdict(int, {})
counter['newkey']
0
counter
defaultdict(int, {'newkey': 0})
counter['another key'] = counter['anohter key'] + 1
counter
defaultdict(int, {'newkey': 0, 'anohter key': 0, 'another key': 1})
# our previous example
#gender_data = {} # {'gender': [salaries..]}
gender_data = collections.defaultdict(list)
for line in csv.DictReader(open(datafile)):
    salary = int(line['salary'])
    gender = line['gender']
    #if gender in gender_data: # is gender present as a key in the dictionary gender_data?
    #    gender_data[gender].append(salary)
    #else:
    #    gender_data[gender] = [salary]
    gender_data[gender].append(salary)
for gender, salaries in gender_data.items():
    mean = round(sum(salaries)/len(salaries))
    print(gender, mean, len(salaries))
Male 84138 444
Genderfluid 95577 17
Female 86417 448
Genderqueer 86913 24
Non-binary 86858 17
Bigender 87399 20
Polygender 77018 18
Agender 96232 12
def select_second(t):
    """Return the item at index 1 of t (usable as a key function when sorting)."""
    second = t[1]
    return second
# print employees by department sorted by last name
department_personel = collections.defaultdict(list)
# The with statement closes the file once all rows have been read;
# newline='' is what the csv module documentation recommends for file objects.
with open(datafile, newline='') as csv_file:
    for line in csv.DictReader(csv_file):
        dept = line['department']
        name = (line['first_name'], line['last_name'])
        department_personel[dept].append(name)

for dept, names in department_personel.items():
    names.sort(key=select_second)  # sort on the last name, the second item of each tuple
    print('Department:', dept)
    for first, last in names:
        print(f'\t{first} {last}')

    break  # only show the first department
Department: Legal
	Alejoa Aleksankin
	Hollie Attenborrow
	Benetta Bangley
	Arlena Broxton
	Ferd Burriss
	Jenna Callacher
	Valdemar Canland
	Dov Cleverly
	Quinlan Coslett
	Lucho Coules
	Ronnie Cowthart
	Marieann Daughtrey
	Dacy De la croix
	Tucker Deeming
	Pris Deinhardt
	Terri Dobbson
	Homerus Donwell
	Sile Dunderdale
	Daniele Eloi
	Horatius Etherington
	Lucius Feehery
	Marcile Fitzackerley
	Valencia Galtone
	Hugibert Garrie
	Mozelle Gerant
	Kacey Girodier
	Bobinette Gratten
	Ariela Greenstock
	Freddie Gricks
	Meaghan Guinness
	Rheta Handrik
	Maggi Huller
	Nicholas Hum
	Marti Jodlkowski
	Allianora Kaasman
	Hersh Karpenko
	Marcos Kulic
	Nap Ledur
	Allix Lidgard
	Humfried Lympany
	Maynord Manterfield
	Corella Mattussevich
	Samson Mayman
	Wald McCuthais
	Klement McGeown
	Madison Middler
	Randy Moat
	Dagmar Mote
	Chantal Mumby
	Peadar Murtell
	Thorsten Muscott
	Dione Norwich
	Ive O'Codihie
	Elysha Orrick
	Gusella Peach
	Cordie Peasgood
	Tiphany Philpotts
	Bobby Picard
	Cordie Plaskitt
	Hillie Porteous
	Field Poulsom
	Millisent Praten
	Barris Puckinghorne
	Reggy Raden
	Lacee Rulten
	Tersina Shaefer
	Beverlee Sharper
	Montague Sibbs
	Maxwell Simeoni
	Nita Stanborough
	Welsh Stelle
	Urbanus Sullly
	Alex Szymoni
	Rowen Taffrey
	Alyosha Taggett
	Dyan Talloe
	Justen Tessington
	Francisco Tomaselli
	Urbano Trevear
	Victoir Warsop
	Meris Whalley
	Ricardo Yeaman
# how to sort a list of tuples?

example_list = [('c', 'b'), ('a', 'd'), ('a', 'c')]
example_list.sort()
example_list
[('a', 'c'), ('a', 'd'), ('c', 'b')]
example_list.sort?
Signature: example_list.sort(*, key=None, reverse=False)
Docstring:
Sort the list in ascending order and return None.

The sort is in-place (i.e. the list itself is modified) and stable (i.e. the
order of two equal elements is maintained).

If a key function is given, apply it once to each list item and sort them,
ascending or descending, according to their function values.

The reverse flag can be set to sort in descending order.
Type:      builtin_function_or_method
example_list.sort(key=select_second)
example_list
[('c', 'b'), ('a', 'c'), ('a', 'd')]
csv.DictReader??
Init signature:
csv.DictReader(
    f,
    fieldnames=None,
    restkey=None,
    restval=None,
    dialect='excel',
    *args,
    **kwds,
)
Docstring:      <no docstring>
Source:        
class DictReader:
    def __init__(self, f, fieldnames=None, restkey=None, restval=None,
                 dialect="excel", *args, **kwds):
        if fieldnames is not None and iter(fieldnames) is fieldnames:
            fieldnames = list(fieldnames)
        self._fieldnames = fieldnames   # list of keys for the dict
        self.restkey = restkey          # key to catch long rows
        self.restval = restval          # default value for short rows
        self.reader = reader(f, dialect, *args, **kwds)
        self.dialect = dialect
        self.line_num = 0

    def __iter__(self):
        return self

    @property
    def fieldnames(self):
        if self._fieldnames is None:
            try:
                self._fieldnames = next(self.reader)
            except StopIteration:
                pass
        self.line_num = self.reader.line_num
        return self._fieldnames

    @fieldnames.setter
    def fieldnames(self, value):
        self._fieldnames = value

    def __next__(self):
        if self.line_num == 0:
            # Used only for its side effect.
            self.fieldnames
        row = next(self.reader)
        self.line_num = self.reader.line_num

        # unlike the basic reader, we prefer not to return blanks,
        # because we will typically wind up with a dict full of None
        # values
        while row == []:
            row = next(self.reader)
        d = dict(zip(self.fieldnames, row))
        lf = len(self.fieldnames)
        lr = len(row)
        if lf < lr:
            d[self.restkey] = row[lf:]
        elif lf > lr:
            for key in self.fieldnames[lr:]:
                d[key] = self.restval
        return d

    __class_getitem__ = classmethod(types.GenericAlias)
File:           ~/miniconda3/envs/bb1000/lib/python3.13/csv.py
Type:           type
Subclasses:     

External libraries#

import numpy
#dir(numpy)
import numpy.linalg
dir(numpy.linalg)
['LinAlgError',
 '__all__',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__path__',
 '__spec__',
 '_linalg',
 '_umath_linalg',
 'cholesky',
 'cond',
 'cross',
 'det',
 'diagonal',
 'eig',
 'eigh',
 'eigvals',
 'eigvalsh',
 'inv',
 'linalg',
 'lstsq',
 'matmul',
 'matrix_norm',
 'matrix_power',
 'matrix_rank',
 'matrix_transpose',
 'multi_dot',
 'norm',
 'outer',
 'pinv',
 'qr',
 'slogdet',
 'solve',
 'svd',
 'svdvals',
 'tensordot',
 'tensorinv',
 'tensorsolve',
 'test',
 'trace',
 'vecdot',
 'vector_norm']
numpy.linalg.solve?
Signature:       numpy.linalg.solve(a, b)
Call signature:  numpy.linalg.solve(*args, **kwargs)
Type:            _ArrayFunctionDispatcher
String form:     <function solve at 0x71a69c5f8ea0>
File:            ~/miniconda3/envs/bb1000/lib/python3.13/site-packages/numpy/linalg/_linalg.py
Docstring:      
Solve a linear matrix equation, or system of linear scalar equations.

Computes the "exact" solution, `x`, of the well-determined, i.e., full
rank, linear matrix equation `ax = b`.

Parameters
----------
a : (..., M, M) array_like
    Coefficient matrix.
b : {(M,), (..., M, K)}, array_like
    Ordinate or "dependent variable" values.

Returns
-------
x : {(..., M,), (..., M, K)} ndarray
    Solution to the system a x = b.  Returned shape is (..., M) if b is
    shape (M,) and (..., M, K) if b is (..., M, K), where the "..." part is
    broadcasted between a and b.

Raises
------
LinAlgError
    If `a` is singular or not square.

See Also
--------
scipy.linalg.solve : Similar function in SciPy.

Notes
-----
Broadcasting rules apply, see the `numpy.linalg` documentation for
details.

The solutions are computed using LAPACK routine ``_gesv``.

`a` must be square and of full-rank, i.e., all rows (or, equivalently,
columns) must be linearly independent; if either is not true, use
`lstsq` for the least-squares best "solution" of the
system/equation.

.. versionchanged:: 2.0

   The b array is only treated as a shape (M,) column vector if it is
   exactly 1-dimensional. In all other instances it is treated as a stack
   of (M, K) matrices. Previously b would be treated as a stack of (M,)
   vectors if b.ndim was equal to a.ndim - 1.

References
----------
.. [1] G. Strang, *Linear Algebra and Its Applications*, 2nd Ed., Orlando,
       FL, Academic Press, Inc., 1980, pg. 22.

Examples
--------
Solve the system of equations:
``x0 + 2 * x1 = 1`` and
``3 * x0 + 5 * x1 = 2``:

>>> import numpy as np
>>> a = np.array([[1, 2], [3, 5]])
>>> b = np.array([1, 2])
>>> x = np.linalg.solve(a, b)
>>> x
array([-1.,  1.])

Check that the solution is correct:

>>> np.allclose(np.dot(a, x), b)
True
Class docstring:
Class to wrap functions with checks for __array_function__ overrides.

All arguments are required, and can only be passed by position.

Parameters
----------
dispatcher : function or None
    The dispatcher function that returns a single sequence-like object
    of all arguments relevant.  It must have the same signature (except
    the default values) as the actual implementation.
    If ``None``, this is a ``like=`` dispatcher and the
    ``_ArrayFunctionDispatcher`` must be called with ``like`` as the
    first (additional and positional) argument.
implementation : function
    Function that implements the operation on NumPy arrays without
    overrides.  Arguments passed calling the ``_ArrayFunctionDispatcher``
    will be forwarded to this (and the ``dispatcher``) as if using
    ``*args, **kwargs``.

Attributes
----------
_implementation : function
    The original implementation passed in.
a = [[3, 0, 0], [1, 8, 0], [0, 4, -2]]
b = [30, 18, 2]
x = numpy.linalg.solve(a, b)
x
array([10.,  1.,  1.])
x[0] + 3*x[1] + x[2]
np.float64(14.0)
c = numpy.array([1, 3, 1])
c
array([1, 3, 1])
numpy.dot(x, c)
np.float64(14.0)
numpy.linspace(0, 1, 11)
array([0. , 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. ])
list(range(0, 11, 1))
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
numpy.arange(0, 11, .1)
array([ 0. ,  0.1,  0.2,  0.3,  0.4,  0.5,  0.6,  0.7,  0.8,  0.9,  1. ,
        1.1,  1.2,  1.3,  1.4,  1.5,  1.6,  1.7,  1.8,  1.9,  2. ,  2.1,
        2.2,  2.3,  2.4,  2.5,  2.6,  2.7,  2.8,  2.9,  3. ,  3.1,  3.2,
        3.3,  3.4,  3.5,  3.6,  3.7,  3.8,  3.9,  4. ,  4.1,  4.2,  4.3,
        4.4,  4.5,  4.6,  4.7,  4.8,  4.9,  5. ,  5.1,  5.2,  5.3,  5.4,
        5.5,  5.6,  5.7,  5.8,  5.9,  6. ,  6.1,  6.2,  6.3,  6.4,  6.5,
        6.6,  6.7,  6.8,  6.9,  7. ,  7.1,  7.2,  7.3,  7.4,  7.5,  7.6,
        7.7,  7.8,  7.9,  8. ,  8.1,  8.2,  8.3,  8.4,  8.5,  8.6,  8.7,
        8.8,  8.9,  9. ,  9.1,  9.2,  9.3,  9.4,  9.5,  9.6,  9.7,  9.8,
        9.9, 10. , 10.1, 10.2, 10.3, 10.4, 10.5, 10.6, 10.7, 10.8, 10.9])
numpy.savetxt('numbers.txt', numpy.arange(0, 11, .1).reshape((11, 10)))
numbers = numpy.loadtxt('numbers.txt')
numbers
array([[ 0. ,  0.1,  0.2,  0.3,  0.4,  0.5,  0.6,  0.7,  0.8,  0.9],
       [ 1. ,  1.1,  1.2,  1.3,  1.4,  1.5,  1.6,  1.7,  1.8,  1.9],
       [ 2. ,  2.1,  2.2,  2.3,  2.4,  2.5,  2.6,  2.7,  2.8,  2.9],
       [ 3. ,  3.1,  3.2,  3.3,  3.4,  3.5,  3.6,  3.7,  3.8,  3.9],
       [ 4. ,  4.1,  4.2,  4.3,  4.4,  4.5,  4.6,  4.7,  4.8,  4.9],
       [ 5. ,  5.1,  5.2,  5.3,  5.4,  5.5,  5.6,  5.7,  5.8,  5.9],
       [ 6. ,  6.1,  6.2,  6.3,  6.4,  6.5,  6.6,  6.7,  6.8,  6.9],
       [ 7. ,  7.1,  7.2,  7.3,  7.4,  7.5,  7.6,  7.7,  7.8,  7.9],
       [ 8. ,  8.1,  8.2,  8.3,  8.4,  8.5,  8.6,  8.7,  8.8,  8.9],
       [ 9. ,  9.1,  9.2,  9.3,  9.4,  9.5,  9.6,  9.7,  9.8,  9.9],
       [10. , 10.1, 10.2, 10.3, 10.4, 10.5, 10.6, 10.7, 10.8, 10.9]])
# numpy.loadtxt('numbers.txt') corresponds to this explicit loop:
value_list = []
# Use a context manager so the file is closed as soon as parsing is done.
with open('numbers.txt') as number_file:
    for line in number_file:
        # start a new row for every line of the file
        value_list.append([])
        # line.split() breaks the line at whitespace into separate text items
        for element in line.split():
            value = float(element)
            value_list[-1].append(value)
value_array= numpy.array(value_list)
value_array
array([[ 0. ,  0.1,  0.2,  0.3,  0.4,  0.5,  0.6,  0.7,  0.8,  0.9],
       [ 1. ,  1.1,  1.2,  1.3,  1.4,  1.5,  1.6,  1.7,  1.8,  1.9],
       [ 2. ,  2.1,  2.2,  2.3,  2.4,  2.5,  2.6,  2.7,  2.8,  2.9],
       [ 3. ,  3.1,  3.2,  3.3,  3.4,  3.5,  3.6,  3.7,  3.8,  3.9],
       [ 4. ,  4.1,  4.2,  4.3,  4.4,  4.5,  4.6,  4.7,  4.8,  4.9],
       [ 5. ,  5.1,  5.2,  5.3,  5.4,  5.5,  5.6,  5.7,  5.8,  5.9],
       [ 6. ,  6.1,  6.2,  6.3,  6.4,  6.5,  6.6,  6.7,  6.8,  6.9],
       [ 7. ,  7.1,  7.2,  7.3,  7.4,  7.5,  7.6,  7.7,  7.8,  7.9],
       [ 8. ,  8.1,  8.2,  8.3,  8.4,  8.5,  8.6,  8.7,  8.8,  8.9],
       [ 9. ,  9.1,  9.2,  9.3,  9.4,  9.5,  9.6,  9.7,  9.8,  9.9],
       [10. , 10.1, 10.2, 10.3, 10.4, 10.5, 10.6, 10.7, 10.8, 10.9]])
numbers[:, : ]# all
numbers[:, -1 ] #last column
numbers[:1, :] # first row
numbers[:3, :3] # upper left 3x3 subblock
array([[0. , 0.1, 0.2],
       [1. , 1.1, 1.2],
       [2. , 2.1, 2.2]])
import time
import numpy
n = 512
a = numpy.ones((n, n))
b = numpy.ones((n, n))
c = numpy.zeros((n, n))
t1 = time.time()
for i in range(n):
    for j in range(n):
        for k in range(n):
            c[i, j] += a[i, k]*b[k, j]
t2 = time.time()
print("Loop timing", t2-t1)
Loop timing 118.91819548606873
t1 = time.time()
c = a @ b
t2 = time.time()
print("Loop timing", t2-t1)
Loop timing 0.03766798973083496
arr = numbers[:3, :3]
arr
array([[0. , 0.1, 0.2],
       [1. , 1.1, 1.2],
       [2. , 2.1, 2.2]])
numpy.linalg.inv(arr)
---------------------------------------------------------------------------
LinAlgError                               Traceback (most recent call last)
Cell In[207], line 1
----> 1 numpy.linalg.inv(arr)

File ~/miniconda3/envs/bb1000/lib/python3.13/site-packages/numpy/linalg/_linalg.py:609, in inv(a)
    606 signature = 'D->D' if isComplexType(t) else 'd->d'
    607 with errstate(call=_raise_linalgerror_singular, invalid='call',
    608               over='ignore', divide='ignore', under='ignore'):
--> 609     ainv = _umath_linalg.inv(a, signature=signature)
    610 return wrap(ainv.astype(result_t, copy=False))

File ~/miniconda3/envs/bb1000/lib/python3.13/site-packages/numpy/linalg/_linalg.py:104, in _raise_linalgerror_singular(err, flag)
    103 def _raise_linalgerror_singular(err, flag):
--> 104     raise LinAlgError("Singular matrix")

LinAlgError: Singular matrix
numpy.linalg.det(arr)
np.float64(0.0)
a = numpy.array([[3, 0, 0], [1, 8, 0], [0, 4, -2]])
b = numpy.array([30, 18, 2])
x = numpy.linalg.solve(a, b)
x
array([10.,  1.,  1.])
a @ x
array([30., 18.,  2.])
numpy.linalg.inv(a) @ a @ x
array([10.,  1.,  1.])
# a @ x = b
# a(inv) @ a @ x = a(inv) @ b
# x = a(inv) @ b
numpy.linalg.inv(a) @ b
array([10.,  1.,  1.])

Matplotlib#

import matplotlib.pyplot as plt
x = numpy.arange(0, 2*3.1415, .1)
x
array([0. , 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. , 1.1, 1.2,
       1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2. , 2.1, 2.2, 2.3, 2.4, 2.5,
       2.6, 2.7, 2.8, 2.9, 3. , 3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8,
       3.9, 4. , 4.1, 4.2, 4.3, 4.4, 4.5, 4.6, 4.7, 4.8, 4.9, 5. , 5.1,
       5.2, 5.3, 5.4, 5.5, 5.6, 5.7, 5.8, 5.9, 6. , 6.1, 6.2])
numpy.sin(x)
array([ 0.        ,  0.09983342,  0.19866933,  0.29552021,  0.38941834,
        0.47942554,  0.56464247,  0.64421769,  0.71735609,  0.78332691,
        0.84147098,  0.89120736,  0.93203909,  0.96355819,  0.98544973,
        0.99749499,  0.9995736 ,  0.99166481,  0.97384763,  0.94630009,
        0.90929743,  0.86320937,  0.8084964 ,  0.74570521,  0.67546318,
        0.59847214,  0.51550137,  0.42737988,  0.33498815,  0.23924933,
        0.14112001,  0.04158066, -0.05837414, -0.15774569, -0.2555411 ,
       -0.35078323, -0.44252044, -0.52983614, -0.61185789, -0.68776616,
       -0.7568025 , -0.81827711, -0.87157577, -0.91616594, -0.95160207,
       -0.97753012, -0.993691  , -0.99992326, -0.99616461, -0.98245261,
       -0.95892427, -0.92581468, -0.88345466, -0.83226744, -0.77276449,
       -0.70554033, -0.63126664, -0.55068554, -0.46460218, -0.37387666,
       -0.2794155 , -0.1821625 , -0.0830894 ])
plt.plot(x, numpy.sin(x), label='sin')
plt.plot(x, numpy.cos(x), label='cos')
plt.legend()
plt.show()
_images/ff542241aca2b529cb4740f7506ef027c8a93ce007500ca152bfd6cd0591a379.png

Pandas#

import pandas as pd
s = pd.Series(range(4))/10
s
0    0.0
1    0.1
2    0.2
3    0.3
dtype: float64
s[1: 2]
1    0.1
dtype: float64
t = pd.Series(range(4), index=['a', 'b', 'c', 'd'])/10
t
a    0.0
b    0.1
c    0.2
d    0.3
dtype: float64
t['b': 'c']
b    0.1
c    0.2
dtype: float64
# indexing with lists
s[[0, 3]]
0    0.0
3    0.3
dtype: float64
t[['a', 'd']]
a    0.0
d    0.3
dtype: float64
#filtering
s > .1
0    False
1    False
2     True
3     True
dtype: bool
s[[False, True, True, False]]
1    0.1
2    0.2
dtype: float64
s[s > .1]
2    0.2
3    0.3
dtype: float64
s.mean()
np.float64(0.15000000000000002)
#dir(s)

Dataframes#

data = {'country': ['Belgium', 'France', 'Germany', 'Netherlands', 'United Kingdom'],
       'population': [11.3, 64.3, 81.3, 16.9, 64.9],
       'area': [30510, 671308, 357050, 41526, 244820],
       'capital': ['Brussels', 'Paris', 'Berlin', 'Amsterdam', 'London']}
df = pd.DataFrame(data)
df
country population area capital
0 Belgium 11.3 30510 Brussels
1 France 64.3 671308 Paris
2 Germany 81.3 357050 Berlin
3 Netherlands 16.9 41526 Amsterdam
4 United Kingdom 64.9 244820 London
df.set_index('country')
population area capital
country
Belgium 11.3 30510 Brussels
France 64.3 671308 Paris
Germany 81.3 357050 Berlin
Netherlands 16.9 41526 Amsterdam
United Kingdom 64.9 244820 London
df #unmodified
country population area capital
0 Belgium 11.3 30510 Brussels
1 France 64.3 671308 Paris
2 Germany 81.3 357050 Berlin
3 Netherlands 16.9 41526 Amsterdam
4 United Kingdom 64.9 244820 London
# equivalent to df = df.set_index('country')
df.set_index('country', inplace=True)
df
population area capital
country
Belgium 11.3 30510 Brussels
France 64.3 671308 Paris
Germany 81.3 357050 Berlin
Netherlands 16.9 41526 Amsterdam
United Kingdom 64.9 244820 London
#select elements
df['capital']
country
Belgium            Brussels
France                Paris
Germany              Berlin
Netherlands       Amsterdam
United Kingdom       London
Name: capital, dtype: object
df['capital']['France']
'Paris'
#alt
df.loc['France', 'capital']
'Paris'
# modify element
df['population']['Belgium'] = 11.4 
/tmp/ipykernel_1077854/1863415940.py:2: FutureWarning: ChainedAssignmentError: behaviour will change in pandas 3.0!
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df['population']['Belgium'] = 11.4
/tmp/ipykernel_1077854/1863415940.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['population']['Belgium'] = 11.4
df.loc['Belgium', 'population'] = 11.4
df
population area capital
country
Belgium 11.4 30510 Brussels
France 64.3 671308 Paris
Germany 81.3 357050 Berlin
Netherlands 16.9 41526 Amsterdam
United Kingdom 64.9 244820 London
df.iloc[0, 0] = 11.5
df
population area capital
country
Belgium 11.5 30510 Brussels
France 64.3 671308 Paris
Germany 81.3 357050 Berlin
Netherlands 16.9 41526 Amsterdam
United Kingdom 64.9 244820 London
df['density'] = df.population / df.area * 1000
df
population area capital density
country
Belgium 11.5 30510 Brussels 0.376926
France 64.3 671308 Paris 0.095783
Germany 81.3 357050 Berlin 0.227699
Netherlands 16.9 41526 Amsterdam 0.406974
United Kingdom 64.9 244820 London 0.265093
df.density > .3
country
Belgium            True
France            False
Germany           False
Netherlands        True
United Kingdom    False
Name: density, dtype: bool
df[df.density > .3]
population area capital density
country
Belgium 11.5 30510 Brussels 0.376926
Netherlands 16.9 41526 Amsterdam 0.406974
df.area /= 1000
df
population area capital density
country
Belgium 11.5 30.510 Brussels 0.376926
France 64.3 671.308 Paris 0.095783
Germany 81.3 357.050 Berlin 0.227699
Netherlands 16.9 41.526 Amsterdam 0.406974
United Kingdom 64.9 244.820 London 0.265093
df.plot()
<Axes: xlabel='country'>
_images/c5f05156364d8585695d40aa85d3bf8428282ad0fd1f6173e3577a9928e17b4b.png
df.plot(kind='bar')
<Axes: xlabel='country'>
_images/e8b31bebe9e7a4d8e184cfbfe054b9d2aef9cce75bdae57e3b7ba7d8dd41766e.png
# reading csv files
datafile
PosixPath('/home/bb1000-vt25/Downloads/MOCK_DATA.csv')
personel = pd.read_csv(datafile)
personel.salary.max()
np.int64(149702)
personel.salary.min()
np.int64(20157)
personel.salary.mean()
np.float64(85548.704)
#dir(personel)
personel.groupby('gender')['salary'].mean()
gender
Agender        96231.916667
Bigender       87399.050000
Female         86417.080357
Genderfluid    95576.764706
Genderqueer    86913.000000
Male           84138.439189
Non-binary     86857.529412
Polygender     77018.111111
Name: salary, dtype: float64
personel.groupby('gender')['salary'].mean().round()
gender
Agender        96232.0
Bigender       87399.0
Female         86417.0
Genderfluid    95577.0
Genderqueer    86913.0
Male           84138.0
Non-binary     86858.0
Polygender     77018.0
Name: salary, dtype: float64
personel.groupby('gender')['salary'].mean().round().astype(int)
gender
Agender        96232
Bigender       87399
Female         86417
Genderfluid    95577
Genderqueer    86913
Male           84138
Non-binary     86858
Polygender     77018
Name: salary, dtype: int64
# get 10 lowest salaries for females, lowest first
personel[personel.gender == 'Female'].sort_values('salary').head(10)
id first_name last_name email gender ip_address employee_id salary department
122 123 Nettie Jandak njandak3e@icq.com Female 166.251.166.227 123 20317 Marketing
12 13 Lonee Staunton lstauntonc@surveymonkey.com Female 138.24.99.81 13 20815 Marketing
155 156 Cymbre Balam cbalam4b@spotify.com Female 55.156.47.9 156 20942 Engineering
339 340 Anna-maria Dadge adadge9f@skyrock.com Female 91.173.33.3 340 21654 Sales
106 107 Amye Schofield aschofield2y@globo.com Female 35.209.149.103 107 21930 Services
583 584 Terri Eat teatg7@wsj.com Female 49.14.126.170 584 23226 Research and Development
286 287 Lillis Jude ljude7y@reddit.com Female 119.211.41.215 287 23616 Engineering
145 146 Karna Eyton keyton41@tripadvisor.com Female 41.142.223.103 146 23657 Accounting
19 20 Noell Gadney ngadneyj@blinklist.com Female 218.219.155.4 20 23794 Research and Development
191 192 Golda Cogswell gcogswell5b@jigsy.com Female 141.109.221.182 192 24087 Support
personel.plot(kind='box', column='salary', by='gender', vert=False)
salary    Axes(0.125,0.11;0.775x0.77)
dtype: object
_images/c77f06e3bfbf02b1beb6843b3ad645777c282300673a8ccfb524649808276c13.png
personel.groupby('gender')['salary'].describe()
count mean std min 25% 50% 75% max
gender
Agender 12.0 96231.916667 33622.378677 38414.0 72965.75 101333.5 119789.75 149702.0
Bigender 20.0 87399.050000 37020.164347 28684.0 52147.25 95875.5 115418.00 138593.0
Female 448.0 86417.080357 38494.376223 20317.0 52904.00 86963.5 121934.00 148645.0
Genderfluid 17.0 95576.764706 32896.581404 36697.0 74373.00 102223.0 120759.00 149576.0
Genderqueer 24.0 86913.000000 42049.463061 22271.0 47929.75 87971.5 121489.25 149019.0
Male 444.0 84138.439189 36493.311676 20157.0 53360.75 82365.0 115660.00 149676.0
Non-binary 17.0 86857.529412 42239.068737 29927.0 48726.00 91899.0 128314.00 148927.0
Polygender 18.0 77018.111111 44610.251539 22343.0 34399.75 69915.5 119753.25 145094.0

Notes on the lab#

x = numpy.arange(10)
y = numpy.arange(1, 20, 2)
len(x), len(y)
(10, 10)
# the coefficient matrix: [[sum of x_i^2, sum of x_i], [sum of x_i, n]]
sum_of_squares_x = x @ x  # dot product of x with itself
sum_of_x = sum(x)
sum_ones = len(x)  # a sum of ones over all data points, i.e. their count
coefficient_matrix = numpy.array(
[
   [ sum_of_squares_x, sum_of_x],
   [ sum_of_x, sum_ones]
])
coefficient_matrix
array([[285,  45],
       [ 45,  10]])
# the right-hand side
rhs = numpy.array([ x @ y, sum(y)])
rhs
array([615, 100])
numpy.linalg.solve(coefficient_matrix, rhs)
array([2., 1.])
### object-oriented plotting
fig, ax = plt.subplots()
_images/1d9291a9ac5bde7c4ddefc7b11da247268e3056dcf3eb1f857e3a5dad85748ba.png
fig, (ax, ax2) = plt.subplots(ncols=2)
#dir(ax)
ax.set_title('Demo title')
ax.set_xlabel('x-data')
ax.set_ylabel('y-data')
ax2.scatter(x, y, marker='o')
ax.plot(x, y, color='red')
[<matplotlib.lines.Line2D at 0x71a6755d4b90>]
_images/d77527930ac61e0e73bf3d1c494e4407033d78700ca159a648021c3c0de33fee.png
import seaborn
seaborn.set_theme(style="ticks", palette="pastel")
seaborn.boxplot(data=personel, y='department', x='salary')
<Axes: xlabel='salary', ylabel='department'>
_images/20dc60c73fdfc4004ba510236b1195222d80f756cda3896cf90b6245cb6a8370.png
seaborn.violinplot(data=personel, y='department', x='salary')
<Axes: xlabel='salary', ylabel='department'>
_images/0a27f26a2aef5f636c9d511e496bae9f07884db8487ee83d931f59cc3eeeb693.png

Interactive plots#

import numpy as np
x = np.arange(-np.pi, np.pi, .1)
import matplotlib.pyplot as plt
plt.plot(x, np.sin(x))
plt.plot(x, np.sin(2*x))
[<matplotlib.lines.Line2D at 0x71a5f1dd9f90>]
_images/d6b751cfeb99dff149a2e55df5f96e9d6e763d50d98d4f4432c794b7c7349f37.png
def sinplot(freq=1.0):
    """Plot sin(freq*x) with matplotlib.

    The curve is sampled in steps of 0.1 starting at -pi and stopping
    before pi. `freq` scales the argument of the sine.
    """
    grid = np.arange(-np.pi, np.pi, .1)
    curve = np.sin(freq * grid)
    plt.plot(grid, curve)
sinplot(1)
sinplot(2)
_images/d6b751cfeb99dff149a2e55df5f96e9d6e763d50d98d4f4432c794b7c7349f37.png
#!conda install ipywidgets -y
#!conda install jupyterlab_widgets -y
import ipywidgets
freq_slider = ipywidgets.FloatSlider(min=1, max=2, value=1.5)
ipywidgets.interact(sinplot, freq=freq_slider)
<function __main__.sinplot(freq=1.0)>
plt.plot(x, np.exp(-x**2))
a = 2
plt.plot(x, np.exp(-a*x**2))
[<matplotlib.lines.Line2D at 0x71a5f0b6dbd0>]
_images/e0d84405302c408316b1e64e7c76c83b4344618fff62f16ae415095d7bd7997d.png
def gaussplot(a=1):
    """Plot the Gaussian exp(-a*x**2) with matplotlib.

    The curve is sampled in steps of 0.1 starting at -pi and stopping
    before pi. `a` is the factor multiplying x**2 in the exponent.
    """
    # Build the grid locally (as sinplot does) instead of reading the
    # notebook-global x, which later cells rebind to other data.
    x = np.arange(-np.pi, np.pi, .1)
    plt.plot(x, np.exp(-a*x**2))
gaussplot(a=1)
gaussplot(a=2)
_images/e0d84405302c408316b1e64e7c76c83b4344618fff62f16ae415095d7bd7997d.png
from ipywidgets import interact, FloatSlider
interact(gaussplot, a=FloatSlider(min=1.0, max=2.0))
<function __main__.gaussplot(a=1)>

Reproducing gapminder presentation by Hans Rosling#

TED talk: https://www.ted.com/talks/hans_rosling_new_insights_on_poverty

screenshot

Can we reproduce the figure to some degree? Data is online at https://www.gapminder.org/data/

  • Scatterplot

  • x-value GDP per capita

  • y-values Infant mortality

  • each circle is a country

  • size reflects population

  • color reflects geographic region

import pandas as pd
pop = pd.read_csv('pop.csv').set_index('country')
pop
1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 ... 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100
country
Afghanistan 3.28M 3.28M 3.28M 3.28M 3.28M 3.28M 3.28M 3.28M 3.28M 3.28M ... 124M 125M 126M 126M 127M 128M 128M 129M 130M 130M
Angola 1.57M 1.57M 1.57M 1.57M 1.57M 1.57M 1.57M 1.57M 1.57M 1.57M ... 139M 140M 142M 143M 144M 145M 147M 148M 149M 150M
Albania 400k 402k 404k 405k 407k 409k 411k 413k 414k 416k ... 1.34M 1.32M 1.3M 1.29M 1.27M 1.25M 1.23M 1.22M 1.2M 1.18M
Andorra 2650 2650 2650 2650 2650 2650 2650 2650 2650 2650 ... 52.8k 52.1k 51.5k 50.8k 50.2k 49.6k 49k 48.4k 47.8k 47.2k
UAE 40.2k 40.2k 40.2k 40.2k 40.2k 40.2k 40.2k 40.2k 40.2k 40.2k ... 24.1M 24.3M 24.5M 24.7M 25M 25.2M 25.4M 25.7M 25.9M 26.1M
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
Samoa 47.3k 47.3k 47.3k 47.3k 47.3k 47.3k 47.3k 47.2k 47.2k 47.2k ... 370k 372k 374k 375k 377k 378k 380k 381k 382k 384k
Yemen 2.59M 2.59M 2.59M 2.59M 2.59M 2.59M 2.59M 2.59M 2.59M 2.59M ... 107M 107M 107M 108M 108M 109M 109M 109M 110M 110M
South Africa 1.45M 1.45M 1.46M 1.46M 1.47M 1.47M 1.48M 1.49M 1.49M 1.5M ... 92.4M 92.6M 92.9M 93.1M 93.3M 93.5M 93.7M 93.9M 94.1M 94.3M
Zambia 747k 758k 770k 782k 794k 806k 818k 831k 843k 856k ... 61.1M 61.5M 61.9M 62.3M 62.7M 63.1M 63.4M 63.8M 64.1M 64.5M
Zimbabwe 1.09M 1.09M 1.09M 1.09M 1.09M 1.09M 1.09M 1.09M 1.09M 1.09M ... 36.3M 36.4M 36.5M 36.6M 36.7M 36.8M 36.9M 37M 37.1M 37.2M

197 rows × 301 columns

gdp = pd.read_csv('gdp_pcap.csv').set_index('country')
gdp
1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 ... 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100
country
Afghanistan 481 481 481 481 481 481 481 481 481 481 ... 4680 4790 4910 5020 5140 5260 5380 5510 5640 5780
Angola 373 374 376 378 379 381 383 385 386 388 ... 24.5k 25k 25.6k 26.1k 26.6k 27.1k 27.7k 28.2k 28.8k 29.3k
Albania 469 471 472 473 475 476 477 479 480 482 ... 54.5k 55.1k 55.7k 56.3k 56.9k 57.4k 58k 58.6k 59.2k 59.8k
Andorra 1370 1370 1370 1380 1380 1380 1390 1390 1390 1390 ... 79.9k 80.2k 80.4k 80.7k 81k 81.3k 81.5k 81.8k 82k 82.3k
UAE 1140 1150 1150 1150 1160 1160 1170 1170 1180 1180 ... 92.6k 92.6k 92.6k 92.7k 92.7k 92.7k 92.8k 92.8k 92.8k 92.9k
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
Samoa 1600 1600 1600 1600 1600 1600 1600 1600 1600 1600 ... 24k 24.5k 25k 25.5k 26k 26.5k 27k 27.6k 28.1k 28.6k
Yemen 1010 1010 1020 1020 1020 1020 1030 1030 1030 1030 ... 6170 6320 6470 6620 6780 6950 7120 7290 7470 7650
South Africa 1750 1730 1710 1690 1670 1590 1590 1720 1510 1470 ... 44.5k 45.1k 45.7k 46.4k 47k 47.6k 48.2k 48.8k 49.5k 50.1k
Zambia 533 535 536 537 539 539 541 543 543 545 ... 16.6k 17k 17.4k 17.8k 18.2k 18.6k 19k 19.4k 19.9k 20.3k
Zimbabwe 919 920 921 922 923 924 925 926 927 928 ... 9840 10.1k 10.3k 10.6k 10.8k 11.1k 11.4k 11.6k 11.9k 12.2k

195 rows × 301 columns

childm = pd.read_csv('child_mortality_0_5_year_olds_dying_per_1000_born.csv').set_index('country')
childm
1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 ... 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100
country
Afghanistan 469.0 469.0 469.0 469.0 469.0 469.0 470.0 470.0 470.0 470.0 ... 12.60 12.40 12.20 12.00 11.80 11.60 11.50 11.30 11.10 11.10
Angola 486.0 486.0 486.0 486.0 486.0 486.0 486.0 486.0 486.0 486.0 ... 17.70 17.50 17.30 17.10 17.00 16.80 16.60 16.40 16.30 16.30
Albania 375.0 375.0 375.0 375.0 375.0 375.0 375.0 375.0 375.0 375.0 ... 2.32 2.30 2.27 2.24 2.22 2.19 2.16 2.14 2.11 2.11
Andorra NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... 0.86 0.84 0.83 0.81 0.80 0.79 0.78 0.77 0.76 0.76
UAE 434.0 434.0 434.0 434.0 434.0 434.0 434.0 434.0 434.0 434.0 ... 2.31 2.29 2.26 2.24 2.22 2.19 2.17 2.15 2.13 2.13
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
Samoa 471.0 468.0 465.0 461.0 458.0 455.0 452.0 449.0 446.0 443.0 ... 3.73 3.70 3.67 3.65 3.62 3.59 3.56 3.54 3.51 3.51
Yemen 540.0 540.0 540.0 540.0 540.0 540.0 540.0 540.0 540.0 540.0 ... 14.30 14.10 13.80 13.60 13.40 13.20 13.00 12.80 12.60 12.60
South Africa 398.0 398.0 398.0 398.0 398.0 398.0 398.0 398.0 398.0 398.0 ... 10.50 10.40 10.20 10.10 9.95 9.82 9.68 9.55 9.42 9.42
Zambia 410.0 410.0 410.0 410.0 410.0 410.0 410.0 410.0 410.0 410.0 ... 12.50 12.30 12.20 12.10 11.90 11.80 11.70 11.60 11.40 11.40
Zimbabwe 396.0 396.0 396.0 396.0 396.0 396.0 396.0 396.0 396.0 396.0 ... 14.60 14.50 14.40 14.30 14.20 14.10 13.90 13.80 13.70 13.70

197 rows × 301 columns

x = gdp.loc[['Sweden', 'Norway', 'Denmark'], '1921']
y = childm.loc[['Sweden', 'Norway', 'Denmark'], '1921']
x
country
Sweden     5200
Norway     4830
Denmark    6980
Name: 1921, dtype: object
y
country
Sweden     84.6
Norway     71.8
Denmark    91.3
Name: 1921, dtype: float64
plt.scatter(x, -np.log(y))
<matplotlib.collections.PathCollection at 0x71a5e9a5dbd0>
_images/45dd69e084a783cc7afd22df23ff1ca44a8dc0295512b7af8600b7a65908533b.png
len(gdp), len(childm)
(195, 197)
df = pd.DataFrame()
df['GDP'] = gdp.loc[:, '1921']
df['Child mortality'] = childm.loc[:, '1921']
df['Population'] = pop.loc[: ,'1921']
df
GDP Child mortality Population
country
Afghanistan 908 465.0 10.6M
Angola 776 465.0 2.91M
Albania 820 359.0 943k
Andorra 4870 NaN 5610
UAE 2520 416.0 57.3k
... ... ... ...
Samoa 2430 204.0 40.7k
Yemen 1420 517.0 3.49M
South Africa 2220 381.0 7.3M
Zambia 799 392.0 1.48M
Zimbabwe 731 379.0 1.56M

195 rows × 3 columns

df.plot.scatter(x='GDP', y='Child mortality')
<Axes: xlabel='GDP', ylabel='Child mortality'>
_images/f12fa89dfa0fe89ade053c744dacb91c5821b314b695ad1a2459c24bda4d3616.png

Update#

make numeric columns#

How do we translate

  • k -> \(10^3\)

  • M -> \(10^6\)

  • B -> \(10^9\)

in a systematic way?

Consider e.g. 10.6M, meaning \(10.6*10^6\)

A function that takes the string and splits off the final character

def convert(x: str) -> float:
    factors = {'k': 10**3, 'M': 10**6, 'B': 10**9}
    try:
        if x[-1] in factors:
            return float(x[:-1]) * factors[x[-1]]
        else:
            return float(x)
    except TypeError:
        # do nothing
        return x
        
convert(2.0), convert('3k'), convert('4M'), convert('1B')
(2.0, 3000.0, 4000000.0, 1000000000.0)

The built-in function map applies a function to a sequence of values

for x in map(convert, [2.0, '3k', '4M', '1B']):
    print(x)
2.0
3000.0
4000000.0
1000000000.0

In a similar vein pandas has a map method that is applied to its members

df['Population'].map(convert)
country
Afghanistan     10600000.0
Angola           2910000.0
Albania           943000.0
Andorra             5610.0
UAE                57300.0
                   ...    
Samoa              40700.0
Yemen            3490000.0
South Africa     7300000.0
Zambia           1480000.0
Zimbabwe         1560000.0
Name: Population, Length: 195, dtype: float64

Apply to and replace all

df = df.map(convert)

To let the population be represented by the area of a circle in the plot, we scale it further with the square root

df['Population'] = np.sqrt(df['Population'])
df
GDP Child mortality Population
country
Afghanistan 908.0 465.0 3255.764119
Angola 776.0 465.0 1705.872211
Albania 820.0 359.0 971.081871
Andorra 4870.0 NaN 74.899933
UAE 2520.0 416.0 239.374184
... ... ... ...
Samoa 2430.0 204.0 201.742410
Yemen 1420.0 517.0 1868.154169
South Africa 2220.0 381.0 2701.851217
Zambia 799.0 392.0 1216.552506
Zimbabwe 731.0 379.0 1248.999600

195 rows × 3 columns

# Create the figure and axes explicitly so the scatter plot is drawn on `ax`.
fig, ax = plt.subplots()
df.plot.scatter(x='GDP', y='Child mortality', s='Population', ax=ax)
<Axes: xlabel='GDP', ylabel='Child mortality'>
_images/29414511ded29bc4df4cd56932cfebaf6f05b100bbac7b281f41fd2a4b330bbe.png
# scale some more
df['Population'] /= 10
df
GDP Child mortality Population
country
Afghanistan 908.0 465.0 325.576412
Angola 776.0 465.0 170.587221
Albania 820.0 359.0 97.108187
Andorra 4870.0 NaN 7.489993
UAE 2520.0 416.0 23.937418
... ... ... ...
Samoa 2430.0 204.0 20.174241
Yemen 1420.0 517.0 186.815417
South Africa 2220.0 381.0 270.185122
Zambia 799.0 392.0 121.655251
Zimbabwe 731.0 379.0 124.899960

195 rows × 3 columns

df.plot.scatter(x='GDP', y='Child mortality', s='Population')
<Axes: xlabel='GDP', ylabel='Child mortality'>
_images/72173f2ebd4b070059048aa28430d7f454a8613fed6342992b88f5f90845b1f4.png

To get some transparency the alpha keyword can be used.

df.plot.scatter(
    x='GDP', 
    y='Child mortality', 
    s='Population',
    alpha=0.5
)
<Axes: xlabel='GDP', ylabel='Child mortality'>
_images/e7f60a58c5860692cf756a505b5600dcb9c361c075f3a490829679462ce7fa7d.png

The original figure has logarithmic scales

df.plot.scatter(
    x='GDP', 
    y='Child mortality', 
    s='Population',
    alpha=0.5,
    logx=True,
    logy=True,
)
<Axes: xlabel='GDP', ylabel='Child mortality'>
_images/334b825f365b4233ce1aedcde4c7fe292a27859e9a0bb3cd99b8f0ad5ea8e143.png

Where are we in all this?

df.plot.scatter(
    x='GDP', 
    y='Child mortality', 
    s='Population',
    alpha=0.5,
    logx=True,
    logy=True,
)
df.loc[['Sweden']].plot.scatter(
    x='GDP', 
    y='Child mortality', 
    s='Population',
    alpha=0.5,
    logx=True,
    logy=True,
    c='green',
)
<Axes: xlabel='GDP', ylabel='Child mortality'>
_images/334b825f365b4233ce1aedcde4c7fe292a27859e9a0bb3cd99b8f0ad5ea8e143.png _images/2624a51c75c593012fb5f84dd9a724b6068c344a1f251b616d9d90f732490512.png

As we get more advanced and attach more things it is wise to work with the Axes objects directly

fig, ax = plt.subplots()
df.plot.scatter(
    x='GDP', 
    y='Child mortality', 
    s='Population',
    alpha=0.5,
    logx=True,
    logy=True,
    ax=ax,
)
df.loc[['Sweden', 'USA']].plot.scatter(
    x='GDP', 
    y='Child mortality', 
    s='Population',
    alpha=0.5,
    logx=True,
    logy=True,
    c='green',
    ax=ax
)
<Axes: xlabel='GDP', ylabel='Child mortality'>
_images/bb059d69a07dc322fccddde5efcad5c3391caacf5c4d19d4e258476951ad2330.png

The original has inverted y axis (so that we plot increasing health vs increasing wealth)

fig, ax = plt.subplots()
df.plot.scatter(
    x='GDP', 
    y='Child mortality', 
    s='Population',
    alpha=0.5,
    logx=True,
    logy=True,
    ax=ax,
)
df.loc[['Sweden', 'USA']].plot.scatter(
    x='GDP', 
    y='Child mortality', 
    s='Population',
    alpha=0.5,
    logx=True,
    logy=True,
    c='green',
    ax=ax
)
ax.invert_yaxis()
_images/b827e2018b54ae07199324d721d87fa225e54493a3e46ce67606a2dfe68a3da1.png

Set some text labels for the selected countries, and background year

selected = ['Sweden', 'USA']

fig, ax = plt.subplots()
df.plot.scatter(
    x='GDP', 
    y='Child mortality', 
    s='Population',
    alpha=0.5,
    logx=True,
    logy=True,
    ax=ax,
)
df.loc[selected].plot.scatter(
    x='GDP', 
    y='Child mortality', 
    s='Population',
    alpha=0.5,
    logx=True,
    logy=True,
    c='green',
    ax=ax
)
ax.invert_yaxis()
for country in selected:
    x = df.loc[country, 'GDP']
    y = df.loc[country, 'Child mortality']
    ax.text(1.1*x, y, country, bbox={'boxstyle': 'round', 'facecolor': 'wheat'})
ax.text(.5, .5, '1921', transform=ax.transAxes, fontsize=100, color='gray', alpha=0.25, ha='center', va='center')
Text(0.5, 0.5, '1921')
_images/810a9f16e13349710f856436da4106acd46176cc088b10812956b85559362d4b.png

Set values on the axis for the selected countries

selected = ['Sweden', 'USA']

fig, ax = plt.subplots()
df.plot.scatter(
    x='GDP', 
    y='Child mortality', 
    s='Population',
    alpha=0.5,
    logx=True,
    logy=True,
    ax=ax,
)
df.loc[selected].plot.scatter(
    x='GDP', 
    y='Child mortality', 
    s='Population',
    alpha=0.5,
    logx=True,
    logy=True,
    c='green',
    ax=ax
)
ax.invert_yaxis()
for country in selected:
    x = df.loc[country, 'GDP']
    y = df.loc[country, 'Child mortality']
    ax.text(1.1*x, y, country, bbox={'boxstyle': 'round', 'facecolor': 'wheat'})
ax.text(.5, .5, '1921', transform=ax.transAxes, fontsize=100, color='gray', alpha=0.25, ha='center', va='center')

ax.set_xticks(df.loc[selected, 'GDP'], labels = [str(int(x)) for x in df.loc[selected, 'GDP']], rotation=90)
ax.set_yticks(df.loc[selected, 'Child mortality'], labels = [str(int(y)) for y in df.loc[selected, 'Child mortality']])
ax.grid(True)
_images/1fcbe1850a70b61b95bafab0dde466839dd55e396daeb371f886159d4659f4eb.png
Finally, we put this in a historical perspective so that all years fit in the graph
selected = ['Sweden', 'USA']

fig, ax = plt.subplots()
df.plot.scatter(
    x='GDP', 
    y='Child mortality', 
    s='Population',
    alpha=0.5,
    logx=True,
    logy=True,
    ax=ax,
)
df.loc[selected].plot.scatter(
    x='GDP', 
    y='Child mortality', 
    s='Population',
    alpha=0.5,
    logx=True,
    logy=True,
    c='green',
    ax=ax
)

for country in selected:
    x = df.loc[country, 'GDP']
    y = df.loc[country, 'Child mortality']
    ax.text(1.1*x, y, country, bbox={'boxstyle': 'round', 'facecolor': 'wheat'})
ax.text(.5, .5, '1921', transform=ax.transAxes, fontsize=100, color='gray', alpha=0.25, ha='center', va='center')

ax.set_xticks(df.loc[selected, 'GDP'], labels = [str(int(x)) for x in df.loc[selected, 'GDP']], rotation=90)
ax.set_yticks(df.loc[selected, 'Child mortality'], labels = [str(int(y)) for y in df.loc[selected, 'Child mortality']])
ax.grid(True)

# Convert each table to numbers once and reuse the result for both extremes.
gdp_numeric = gdp.map(convert)
childm_numeric = childm.map(convert)
# axis=None reduces over the whole table, i.e. over all countries and years.
x_min = gdp_numeric.min(axis=None)
x_max = gdp_numeric.max(axis=None)
y_min = childm_numeric.min(axis=None)
y_max = childm_numeric.max(axis=None)
ax.set_xlim(x_min, x_max)
ax.set_ylim(y_min, y_max)

ax.invert_yaxis()
_images/f379e78911e9789f27dcb2a6126d728335cc3d43868723518ddc2bd9de6b8302.png

Challenges:

  • Make a movie of frames

  • Make a slider for the chosen year

Classes#

str1 = "Hello world!"
print(str1)
Hello world!
str1.upper()
"""
Implies that there is a upper method in the str class

class str:
     def upper(self):
         returns a new string replacing lower with upper case

"""
'\nImplies that there is a upper method in the str class\n\nclass str:\n     def upper(self):\n         returns a new string replacing lower with upper case\n\n'
str1.upper?
Signature: str1.upper()
Docstring: Return a copy of the string converted to uppercase.
Type:      builtin_function_or_method
str.upper
<method 'upper' of 'str' objects>
str.upper?
Signature: str.upper(self, /)
Docstring: Return a copy of the string converted to uppercase.
Type:      method_descriptor
str.upper(str1) # call the upper function of the str class with str1 as argument
'HELLO WORLD!'
str1.upper() # whatever class str1 belongs to, call its upper method with str1 as the first argument (usually declared with the name self)
'HELLO WORLD!'
# example
class Person:
    """
    Datatype for personal data
    """
    def __init__(self, given_name, surname):
        # both name parts are stored unchanged as instance attributes
        self.given_name = given_name
        self.surname = surname

    def __str__(self): # usually a nicely readable string representation of the object
        return f"Person: {self.given_name} {self.surname}"

    def __repr__(self): # usually a unique accurate representation of an object
        # !r inserts repr() of each attribute, so quotes inside a name are
        # handled correctly and the result is a valid Python expression
        return f"Person({self.given_name!r}, {self.surname!r})"

Person("Joe", "H.")
Person('Joe', 'H.')
Person('Joe', 'H.')
Person('Joe', 'H.')
Person?
Init signature: Person(given_name, surname)
Docstring:      Datatype for personal data
Type:           type
Subclasses:     
type(int)
type
int()
0
type(int())
int
Person('Jane', 'Smith')
Person('Jane', 'Smith')
p=Person('Jane', 'Smith')
type(p)
__main__.Person
dir(p)
['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__firstlineno__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__static_attributes__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 'given_name',
 'surname']
p.given_name
'Jane'
p.surname
'Smith'
print(p)
Person: Jane Smith
str(p) # delegates to p.__str__()
'Person: Jane Smith'
p.__str__()
'Person: Jane Smith'
p.given_name = "John"
print(p)
Person: John Smith
p
Person('John', 'Smith')
class Person:
    """
    Datatype for personal data
    """
    number = 0 # class attribute
    def __init__(self, given_name, surname):
        self.given_name = given_name # instance attribute
        self.surname = surname
        # count the new instance on its own class; a subclass that defines
        # its own `number` therefore keeps a separate count
        self.__class__.number += 1

    def __str__(self): # usually a nicely readable string representation of the object
        return f"Person: {self.given_name} {self.surname}"

    def __repr__(self): # usually a unique accurate representation of an object
        # !r inserts repr() of each attribute, so quotes inside a name are
        # handled correctly and the result is a valid Python expression
        return f"Person({self.given_name!r}, {self.surname!r})"

    def display_person(self):
        # prints "surname, given name" on one line
        print(f"{self.surname}, {self.given_name}")
p1 = Person("Jane", "Smith")
p2 = Person("John", "Smith") # two different instances of class Person
p1.display_person()
p2.display_person()
Smith, Jane
Smith, John
p1.number
2
p2.number
2
Person.number
2
chosen_attribute = "given_name"
# getattr is the built-in way to look up an attribute whose name is stored in a string
getattr(p1, chosen_attribute)
'Jane'
# Addition operator +

2 + 3
5
"ab" + "cd"
'abcd'
[1] + [2]
[1, 2]
# Example: complex number

class MyComplex:
    """
    Implements complex numbers in Python
    """
    def __init__(self, re=0, im=0):
        # real and imaginary parts are always stored as floats
        self.re = float(re)
        self.im = float(im)

    def __repr__(self):
        return f"MyComplex({self.re}, {self.im})"

    def __str__(self):
        # show a negative imaginary part as "a - bi" instead of "a + -bi"
        sign = '-' if self.im < 0 else '+'
        return f"{self.re} {sign} {abs(self.im)}i"

    # def __add__(left, right)
    def __add__(self, other):
        # called for self + other; returns a new MyComplex, operands are unchanged
        try:
            re = self.re + other.re
            im = self.im + other.im
        except AttributeError:
            # other has no re/im attributes: let Python try other.__radd__
            # and raise TypeError if that fails too
            return NotImplemented
        return MyComplex(re, im)
        
    
z = MyComplex()
z
MyComplex(0.0, 0.0)
z.re
0.0
z.im
0.0
str(z)
'0.0 + 0.0i'
z1 = MyComplex(1, 2)
z2 = MyComplex(3, 4)
z1 + z2
MyComplex(4.0, 6.0)
print(z1 + z2)
4.0 + 6.0i

Inheritance#

class Employee(Person):
    """
    An employee class, derives from Person

    Stores a salary in addition to the names handled by Person.__init__.
    """
    number = 0  # class attribute defined on Employee itself, separate from Person.number
    def __init__(self, given_name, surname, salary):
        # explicit alternative to the super() call below:
        #Person.__init__(self, given_name, surname)
        super().__init__(given_name, surname)
        self.salary = salary
e = Employee("Olav", "Vahtras", 0)
e.display_person()
Vahtras, Olav
e.number
1
dir(e)[0]
'__class__'
e.__class__
__main__.Employee

Testing#

assert True
assert False
---------------------------------------------------------------------------
AssertionError                            Traceback (most recent call last)
Cell In[460], line 1
----> 1 assert False

AssertionError: 
%%file test_my_math.py
from my_math import my_add
from pytest import approx

def test_my_add():
   assert my_add(1, 1) == 2

def test_my_add_floats():
    assert my_add(.1, .2) == approx(.3)
    
Overwriting test_my_math.py
%%file my_math.py
def my_add(x, y):
    return x - y
Overwriting my_math.py
!pytest -v test_my_math.py
============================= test session starts ==============================
platform linux -- Python 3.13.2, pytest-8.3.5, pluggy-1.5.0 -- /home/bb1000-vt25/miniconda3/envs/bb1000/bin/python3.13
cachedir: .pytest_cache
rootdir: /home/bb1000-vt25/bb1000
plugins: cov-6.0.0, anyio-4.8.0
collected 2 items                                                              

test_my_math.py::test_my_add FAILED                                      [ 50%]
test_my_math.py::test_my_add_floats FAILED                               [100%]

=================================== FAILURES ===================================
_________________________________ test_my_add __________________________________

    def test_my_add():
>      assert my_add(1, 1) == 2
E      assert 0 == 2
E       +  where 0 = my_add(1, 1)

test_my_math.py:5: AssertionError
______________________________ test_my_add_floats ______________________________

    def test_my_add_floats():
>       assert my_add(.1, .2) == approx(.3)
E       assert -0.1 == 0.3 ± 3.0e-07
E         
E         comparison failed
E         Obtained: -0.1
E         Expected: 0.3 ± 3.0e-07

test_my_math.py:8: AssertionError
=========================== short test summary info ============================
FAILED test_my_math.py::test_my_add - assert 0 == 2
FAILED test_my_math.py::test_my_add_floats - assert -0.1 == 0.3 ± 3.0e-07
============================== 2 failed in 0.08s ===============================
%%file my_math.py
def my_add(x, y):
    """Return the sum x + y"""
    return x + y
Overwriting my_math.py
!pytest -v test_my_math.py
============================= test session starts ==============================
platform linux -- Python 3.13.2, pytest-8.3.5, pluggy-1.5.0 -- /home/bb1000-vt25/miniconda3/envs/bb1000/bin/python3.13
cachedir: .pytest_cache
rootdir: /home/bb1000-vt25/bb1000
plugins: cov-6.0.0, anyio-4.8.0
collected 2 items                                                              

test_my_math.py::test_my_add PASSED                                      [ 50%]
test_my_math.py::test_my_add_floats PASSED                               [100%]

============================== 2 passed in 0.02s ===============================
%%file test_leap.py
from leap import is_leap_year
def test_leap_year():
    assert is_leap_year(2000) == True
    assert is_leap_year(1999) == False
    assert is_leap_year(1998) == False
    assert is_leap_year(1996) == True
    assert is_leap_year(1900) == False
    assert is_leap_year(1800) == False
    assert is_leap_year(1600) == True
Overwriting test_leap.py
%%file leap.py
def is_leap_year(year):
    """
    Returns True for leap years

    NOTE: this first version only checks divisibility by 4, so it also
    returns True for century years such as 1900.
    """
    if year % 4 == 0:
        return True
    else:
        return False
Overwriting leap.py
!pytest test_leap.py
============================= test session starts ==============================
platform linux -- Python 3.13.2, pytest-8.3.5, pluggy-1.5.0
rootdir: /home/bb1000-vt25/bb1000
plugins: cov-6.0.0, anyio-4.8.0
collected 1 item                                                               

test_leap.py F                                                           [100%]

=================================== FAILURES ===================================
________________________________ test_leap_year ________________________________

    def test_leap_year():
        assert is_leap_year(2000) == True
        assert is_leap_year(1999) == False
        assert is_leap_year(1998) == False
        assert is_leap_year(1996) == True
>       assert is_leap_year(1900) == False
E       assert True == False
E        +  where True = is_leap_year(1900)

test_leap.py:7: AssertionError
=========================== short test summary info ============================
FAILED test_leap.py::test_leap_year - assert True == False
============================== 1 failed in 0.07s ===============================
%%file leap.py
def is_leap_year(year):
    """
    Returns True for leap years

    Years divisible by 100 are leap years only if they are also divisible
    by 400; all other years are leap years if they are divisible by 4.
    """
    if year % 100 == 0:
        # century year: needs to be divisible by 400
        if year % 400 == 0:
            return True
        else:
            return False
    else:
        # ordinary year: needs to be divisible by 4
        if year % 4 == 0:
            return True
        else:
            return False
Overwriting leap.py
!pytest -v test_leap.py
============================= test session starts ==============================
platform linux -- Python 3.13.2, pytest-8.3.5, pluggy-1.5.0 -- /home/bb1000-vt25/miniconda3/envs/bb1000/bin/python3.13
cachedir: .pytest_cache
rootdir: /home/bb1000-vt25/bb1000
plugins: cov-6.0.0, anyio-4.8.0
collected 1 item                                                               

test_leap.py::test_leap_year PASSED                                      [100%]

============================== 1 passed in 0.02s ===============================
# cleanup / improve code
%%file leap.py
def is_leap_year(year):
    """
    Returns True for leap years

    Years divisible by 100 are leap years only if they are also divisible
    by 400; all other years are leap years if they are divisible by 4.
    """
    if year % 100 == 0:
        # the comparison itself is the boolean result
        return year % 400 == 0
    else:
        return year % 4 == 0
Overwriting leap.py
!pytest -v
============================= test session starts ==============================
platform linux -- Python 3.13.2, pytest-8.3.5, pluggy-1.5.0 -- /home/bb1000-vt25/miniconda3/envs/bb1000/bin/python3.13
cachedir: .pytest_cache
rootdir: /home/bb1000-vt25/bb1000
plugins: cov-6.0.0, anyio-4.8.0
collected 8 items                                                              

git_demo/test_savings.py::test_one_year PASSED                           [ 12%]
git_demo/test_savings.py::test_two_years PASSED                          [ 25%]
git_demo/test_savings.py::test_two_years_only_inital_deposit PASSED      [ 37%]
test_leap.py::test_leap_year PASSED                                      [ 50%]
test_my_math.py::test_my_add PASSED                                      [ 62%]
test_my_math.py::test_my_add_floats PASSED                               [ 75%]
test_timestamps.py::test_1 PASSED                                        [ 87%]
test_timestamps.py::test_6 FAILED                                        [100%]

=================================== FAILURES ===================================
____________________________________ test_6 ____________________________________

    def test_6():
        """
        >>> sum_timestamps(['6:35:32', '2:45:48', '40:10'])
        '10:01:30'
        """
>       assert timestamps(['6:35:32', '2:45:48', '40:10']) == '10:01:30'
E       TypeError: 'module' object is not callable

test_timestamps.py:13: TypeError
=========================== short test summary info ============================
FAILED test_timestamps.py::test_6 - TypeError: 'module' object is not callable
========================= 1 failed, 7 passed in 0.54s ==========================
%%file test_leap.py
from leap import is_leap_year
import pytest

# (year, expected result) pairs; pytest runs one test case per pair
TEST_DATA = [
    (2000, True),   # divisible by 400
    (1999, False),
    (1998, False),
    (1996, True),   # divisible by 4 but not by 100
    (1900, False),  # divisible by 100 but not by 400
    (1800, False),  # divisible by 100 but not by 400
    (1600, True),   # divisible by 400
]

@pytest.mark.parametrize('year,expected', TEST_DATA)
def test_leap_year(year, expected):
    """Check is_leap_year for one (year, expected) pair from TEST_DATA"""
    assert is_leap_year(year) == expected
Overwriting test_leap.py
!pytest -v
============================= test session starts ==============================
platform linux -- Python 3.13.2, pytest-8.3.5, pluggy-1.5.0 -- /home/bb1000-vt25/miniconda3/envs/bb1000/bin/python3.13
cachedir: .pytest_cache
rootdir: /home/bb1000-vt25/bb1000
plugins: cov-6.0.0, anyio-4.8.0
collected 14 items                                                             

git_demo/test_savings.py::test_one_year PASSED                           [  7%]
git_demo/test_savings.py::test_two_years PASSED                          [ 14%]
git_demo/test_savings.py::test_two_years_only_inital_deposit PASSED      [ 21%]
test_leap.py::test_leap_year[2000-True] PASSED                           [ 28%]
test_leap.py::test_leap_year[1999-False] PASSED                          [ 35%]
test_leap.py::test_leap_year[1998-False] PASSED                          [ 42%]
test_leap.py::test_leap_year[1996-True] PASSED                           [ 50%]
test_leap.py::test_leap_year[1900-False] PASSED                          [ 57%]
test_leap.py::test_leap_year[1800-False] PASSED                          [ 64%]
test_leap.py::test_leap_year[1600-True] PASSED                           [ 71%]
test_my_math.py::test_my_add PASSED                                      [ 78%]
test_my_math.py::test_my_add_floats PASSED                               [ 85%]
test_timestamps.py::test_1 PASSED                                        [ 92%]
test_timestamps.py::test_6 FAILED                                        [100%]

=================================== FAILURES ===================================
____________________________________ test_6 ____________________________________

    def test_6():
        """
        >>> sum_timestamps(['6:35:32', '2:45:48', '40:10'])
        '10:01:30'
        """
>       assert timestamps(['6:35:32', '2:45:48', '40:10']) == '10:01:30'
E       TypeError: 'module' object is not callable

test_timestamps.py:13: TypeError
=========================== short test summary info ============================
FAILED test_timestamps.py::test_6 - TypeError: 'module' object is not callable
========================= 1 failed, 13 passed in 0.47s =========================
# coverage
!conda install pytest-cov -y
Channels:
 - defaults
Platform: linux-64
Collecting package metadata (repodata.json): done
Solving environment: done


==> WARNING: A newer version of conda exists. <==
    current version: 25.1.1
    latest version: 25.3.1

Please update conda by running

    $ conda update -n base -c defaults conda



# All requested packages already installed.
!pytest test_leap.py --cov leap
============================= test session starts ==============================
platform linux -- Python 3.13.2, pytest-8.3.5, pluggy-1.5.0
rootdir: /home/bb1000-vt25/bb1000
plugins: cov-6.0.0, anyio-4.8.0
collected 7 items                                                              

test_leap.py .......                                                     [100%]

---------- coverage: platform linux, python 3.13.2-final-0 -----------
Name      Stmts   Miss  Cover
-----------------------------
leap.py       4      0   100%
-----------------------------
TOTAL         4      0   100%


============================== 7 passed in 0.06s ===============================
!pytest test_leap.py --cov leap --cov-report=html
============================= test session starts ==============================
platform linux -- Python 3.13.2, pytest-8.3.5, pluggy-1.5.0
rootdir: /home/bb1000-vt25/bb1000
plugins: cov-6.0.0, anyio-4.8.0
collected 7 items                                                              

test_leap.py .......                                                     [100%]

---------- coverage: platform linux, python 3.13.2-final-0 -----------
Coverage HTML written to dir htmlcov


============================== 7 passed in 0.09s ===============================

doctest#

%%file leap.py
def is_leap_year(year):
    """
    Tell whether a year is a leap year

    Century years (divisible by 100) must also be divisible by 400;
    any other year only has to be divisible by 4.

    >>> is_leap_year(2000)
    True
    >>> is_leap_year(1900)
    False

    """
    if year % 100:
        # non-zero remainder: not a century year
        return year % 4 == 0
    return year % 400 == 0
Overwriting leap.py
!python -m doctest leap.py -v
Trying:
    is_leap_year(2000)
Expecting:
    True
ok
Trying:
    is_leap_year(1900)
Expecting:
    False
ok
1 item had no tests:
    leap
1 item passed all tests:
   2 tests in leap.is_leap_year
2 tests in 2 items.
2 passed.
Test passed.
# indentation error
if True:
 print("Hello")
  print("Hello")
  Cell In[481], line 4
    print("Hello")
    ^
IndentationError: unexpected indent
# indentation 2-spaces
if True:
  print("Hello")
  print("Hello")
Hello
Hello
# indentation 4-spaces (PEP8 convention)
if True:
    print("Hello")
    print("Hello")
Hello
Hello

Version control#

!git --help
usage: git [-v | --version] [-h | --help] [-C <path>] [-c <name>=<value>]
           [--exec-path[=<path>]] [--html-path] [--man-path] [--info-path]
           [-p | --paginate | -P | --no-pager] [--no-replace-objects] [--no-lazy-fetch]
           [--no-optional-locks] [--no-advice] [--bare] [--git-dir=<path>]
           [--work-tree=<path>] [--namespace=<name>] [--config-env=<name>=<envvar>]
           <command> [<args>]

These are common Git commands used in various situations:

start a working area (see also: git help tutorial)
   clone     Clone a repository into a new directory
   init      Create an empty Git repository or reinitialize an existing one

work on the current change (see also: git help everyday)
   add       Add file contents to the index
   mv        Move or rename a file, a directory, or a symlink
   restore   Restore working tree files
   rm        Remove files from the working tree and from the index

examine the history and state (see also: git help revisions)
   bisect    Use binary search to find the commit that introduced a bug
   diff      Show changes between commits, commit and working tree, etc
   grep      Print lines matching a pattern
   log       Show commit logs
   show      Show various types of objects
   status    Show the working tree status

grow, mark and tweak your common history
   branch    List, create, or delete branches
   commit    Record changes to the repository
   merge     Join two or more development histories together
   rebase    Reapply commits on top of another base tip
   reset     Reset current HEAD to the specified state
   switch    Switch branches
   tag       Create, list, delete or verify a tag object signed with GPG

collaborate (see also: git help workflows)
   fetch     Download objects and refs from another repository
   pull      Fetch from and integrate with another repository or a local branch
   push      Update remote refs along with associated objects

'git help -a' and 'git help -g' list available subcommands and some
concept guides. See 'git help <command>' or 'git help <concept>'
to read about a specific subcommand or concept.
See 'git help git' for an overview of the system.

configuration#

!git config --global user.name "Olav Vahtras"
!git config --global user.email vahtras@kth.se
!more ~/.gitconfig
[user]
	email = vahtras@kth.se
	name = Olav Vahtras
%cd ~/bb1000
/home/bb1000-vt25/bb1000
%mkdir git_demo
%cd git_demo/
/home/bb1000-vt25/bb1000/git_demo
#initialize current folder for git
! git init .
hint: Using 'master' as the name for the initial branch. This default branch name
hint: is subject to change. To configure the initial branch name to use in all
hint: of your new repositories, which will suppress this warning, call:
hint:
hint: 	git config --global init.defaultBranch <name>
hint:
hint: Names commonly chosen instead of 'master' are 'main', 'trunk' and
hint: 'development'. The just-created branch can be renamed via this command:
hint:
hint: 	git branch -m <name>
Initialized empty Git repository in /home/bb1000-vt25/bb1000/git_demo/.git/
!ls -a
.  ..  .git
!git status
On branch master

No commits yet

nothing to commit (create/copy files and use "git add" to track)
# work...
%%file savings.py
# Calculate retirement savings

def savings_calculator(amount, interest):
    final_amount = amount + amount*interest
    return final_amount
Overwriting savings.py
%%file test_savings.py
import savings

def test_one_year():
    initial_amount = 500
    annual_interest = .05
    expected_final = 525
    calculated_final = savings.savings_calculator(initial_amount, annual_interest)
    assert calculated_final == expected_final
Overwriting test_savings.py
!git status
On branch master

No commits yet

Untracked files:
  (use "git add <file>..." to include in what will be committed)
	__pycache__/
	savings.py
	test_savings.py

nothing added to commit but untracked files present (use "git add" to track)
# create .gitignore file to list files to ignore
%%file .gitignore
*.pyc
Writing .gitignore
!git status
On branch master

No commits yet

Untracked files:
  (use "git add <file>..." to include in what will be committed)
	.gitignore
	savings.py
	test_savings.py

nothing added to commit but untracked files present (use "git add" to track)
# stage the changes: add them to the index (also called the cache or staging area)
!git add .
!git status
On branch master

No commits yet

Changes to be committed:
  (use "git rm --cached <file>..." to unstage)
	new file:   .gitignore
	new file:   savings.py
	new file:   test_savings.py
!git commit -m "Initial commit"
[master (root-commit) a324f3f] Initial commit
 3 files changed, 14 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 savings.py
 create mode 100644 test_savings.py
!git status
On branch master
nothing to commit, working tree clean
!git log
commit a324f3f127df2eb866b3f870901ab0c91baf41ff (HEAD -> master)
Author: Olav Vahtras <vahtras@kth.se>
Date:   Tue May 20 21:50:43 2025 +0200

    Initial commit
# new changes to code
%%file savings.py
# Calculate retirement savings

def savings_calculator(amount, interest, years=1):
    final_amount = amount*(1 + interest)**years
    return final_amount
Overwriting savings.py
!git status
On branch master
Changes not staged for commit:
  (use "git add <file>..." to update what will be committed)
  (use "git restore <file>..." to discard changes in working directory)
	modified:   savings.py

no changes added to commit (use "git add" and/or "git commit -a")
# stage all modified tracked files (-u leaves untracked files alone)
!git add -u
!git status
On branch master
Changes to be committed:
  (use "git restore --staged <file>..." to unstage)
	modified:   savings.py
# save the staged changes to the repository
!git commit -m 'Allow for several years of savings'
[master 0075416] Allow for several years of savings
 1 file changed, 3 insertions(+), 3 deletions(-)
!git status
On branch master
nothing to commit, working tree clean
!git log
commit 0075416fafd7a9f5432bfb9bcd0b7461221215ce (HEAD -> master)
Author: Olav Vahtras <vahtras@kth.se>
Date:   Tue May 20 21:55:23 2025 +0200

    Allow for several years of savings

commit a324f3f127df2eb866b3f870901ab0c91baf41ff
Author: Olav Vahtras <vahtras@kth.se>
Date:   Tue May 20 21:50:43 2025 +0200

    Initial commit
!git log --oneline
0075416 (HEAD -> master) Allow for several years of savings
a324f3f Initial commit
# new changes
%%file savings.py
# Calculate retirement savings

def savings_calculator(amount, interest, years=1):
    final_amount = 0
    for year in range(years):
        final_amount = (amount + final_amount)*(1 + interest)
    return final_amount
Overwriting savings.py
!git status
On branch master
Changes not staged for commit:
  (use "git add <file>..." to update what will be committed)
  (use "git restore <file>..." to discard changes in working directory)
	modified:   savings.py

no changes added to commit (use "git add" and/or "git commit -a")
!git diff
diff --git a/savings.py b/savings.py
index 715f219..46443a6 100644
--- a/savings.py
+++ b/savings.py
@@ -1,5 +1,7 @@
 # Calculate retirement savings
 
 def savings_calculator(amount, interest, years=1):
-    final_amount = amount*(1 + interest)**years
-    return final_amount
+    final_amount = 0
+    for year in range(years):
+        final_amount = (amount + final_amount)*(1 + interest)
+    return final_amount
!git add -u
!git status
On branch master
Changes to be committed:
  (use "git restore --staged <file>..." to unstage)
	modified:   savings.py
!git commit -m "Annual deposits with for loop"
[master ea52645] Annual deposits with for loop
 1 file changed, 4 insertions(+), 2 deletions(-)
!git status
On branch master
nothing to commit, working tree clean
!git log
commit ea526456f52d4a6e361f46ec799a7748ea257469 (HEAD -> master)
Author: Olav Vahtras <vahtras@kth.se>
Date:   Tue May 20 22:00:38 2025 +0200

    Annual deposits with for loop

commit 0075416fafd7a9f5432bfb9bcd0b7461221215ce
Author: Olav Vahtras <vahtras@kth.se>
Date:   Tue May 20 21:55:23 2025 +0200

    Allow for several years of savings

commit a324f3f127df2eb866b3f870901ab0c91baf41ff
Author: Olav Vahtras <vahtras@kth.se>
Date:   Tue May 20 21:50:43 2025 +0200

    Initial commit
!git log --oneline
ea52645 (HEAD -> master) Annual deposits with for loop
0075416 Allow for several years of savings
a324f3f Initial commit
# have a look at the previous version
!git checkout 0075416
Note: switching to '0075416'.

You are in 'detached HEAD' state. You can look around, make experimental
changes and commit them, and you can discard any commits you make in this
state without impacting any branches by switching back to a branch.

If you want to create a new branch to retain commits you create, you may
do so (now or later) by using -c with the switch command. Example:

  git switch -c <new-branch-name>

Or undo this operation with:

  git switch -

Turn off this advice by setting config variable advice.detachedHead to false

HEAD is now at 0075416 Allow for several years of savings
!git log --oneline
0075416 (HEAD) Allow for several years of savings
a324f3f Initial commit
!git log --oneline --all
ea52645 (master) Annual deposits with for loop
0075416 (HEAD) Allow for several years of savings
a324f3f Initial commit
!git switch master
Previous HEAD position was 0075416 Allow for several years of savings
Switched to branch 'master'
!git log --oneline --all
ea52645 (HEAD -> master) Annual deposits with for loop
0075416 Allow for several years of savings
a324f3f Initial commit

Branches#

Two lines of development:

  • a script that runs from the command line

  • varying the number of years during which deposits are made

We use a separate git branch for each of them.

!git branch script
!git branch varying-deposits
!git log --oneline
ea52645 (HEAD -> master, varying-deposits, script) Annual deposits with for loop
0075416 Allow for several years of savings
a324f3f Initial commit
!git switch script
Switched to branch 'script'
!git log --oneline
ea52645 (HEAD -> script, varying-deposits, master) Annual deposits with for loop
0075416 Allow for several years of savings
a324f3f Initial commit
%%file savings.py
# Calculate retirement savings
import sys

def savings_calculator(amount, interest, years=1):
    final_amount = 0
    for year in range(years):
        final_amount = round((amount + final_amount)*(1 + interest), 2)
    return final_amount

if __name__ == "__main__":
    if len(sys.argv) == 3:
        amount = int(sys.argv[1])
        interest = float(sys.argv[2])
        final = savings_calculator(amount, interest)
        years = 1
    elif len(sys.argv) == 4:
        amount = int(sys.argv[1])
        interest = float(sys.argv[2])
        years = int(sys.argv[3])
        final = savings_calculator(amount, interest, years)
    else:
        print(f"Usage: {sys.argv[0]} amount interest [years]")
        exit()

    print(f"Savings after {years} years: {final}")
Overwriting savings.py
!git add -u
!git commit -m "Include command-line arguments"
[script 65a3b2d] Include command-line arguments
 1 file changed, 19 insertions(+), 1 deletion(-)
!git log --oneline --all
65a3b2d (HEAD -> script) Include command-line arguments
ea52645 (varying-deposits, master) Annual deposits with for loop
0075416 Allow for several years of savings
a324f3f Initial commit
!git switch varying-deposits
Switched to branch 'varying-deposits'
!git log --oneline --all
65a3b2d (script) Include command-line arguments
ea52645 (HEAD -> varying-deposits, master) Annual deposits with for loop
0075416 Allow for several years of savings
a324f3f Initial commit
# new changes related to years of depositing
%%file savings.py
# Calculate retirement savings

def savings_calculator(amount, interest, years=1, stop_deposit=None):
    final_amount = 0
    if stop_deposit is None:
        stop_deposit = years
        
    for year in range(years):
        if year < stop_deposit:
            final_amount = (amount + final_amount)*(1 + interest)
        else:
            final_amount = final_amount*(1 + interest)
    return final_amount
Overwriting savings.py
!git diff
diff --git a/savings.py b/savings.py
index 46443a6..d1611da 100644
--- a/savings.py
+++ b/savings.py
@@ -1,7 +1,13 @@
 # Calculate retirement savings
 
-def savings_calculator(amount, interest, years=1):
+def savings_calculator(amount, interest, years=1, stop_deposit=None):
     final_amount = 0
+    if stop_deposit is None:
+        stop_deposit = years
+
     for year in range(years):
-        final_amount = (amount + final_amount)*(1 + interest)
+        if year < stop_deposit:
+            final_amount = (amount + final_amount)*(1 + interest)
+        else:
+            final_amount = final_amount*(1 + interest)
     return final_amount
!git add -u
!git commit -m "Changes for optional deposit years"
[varying-deposits 335928a] Changes for optional deposit years
 1 file changed, 8 insertions(+), 2 deletions(-)
!git log --oneline --all
335928a (HEAD -> varying-deposits) Changes for optional deposit years
65a3b2d (script) Include command-line arguments
ea52645 (master) Annual deposits with for loop
0075416 Allow for several years of savings
a324f3f Initial commit
# include changes in script to master
!git switch master
Switched to branch 'master'
!git merge script
Updating ea52645..65a3b2d
Fast-forward
 savings.py | 20 +++++++++++++++++++-
 1 file changed, 19 insertions(+), 1 deletion(-)
!git log --oneline --all
335928a (varying-deposits) Changes for optional deposit years
65a3b2d (HEAD -> master, script) Include command-line arguments
ea52645 Annual deposits with for loop
0075416 Allow for several years of savings
a324f3f Initial commit
!git merge varying-deposits
Auto-merging savings.py
CONFLICT (content): Merge conflict in savings.py
Automatic merge failed; fix conflicts and then commit the result.
!git status
On branch master
You have unmerged paths.
  (fix conflicts and run "git commit")
  (use "git merge --abort" to abort the merge)

Unmerged paths:
  (use "git add <file>..." to mark resolution)
	both modified:   savings.py

no changes added to commit (use "git add" and/or "git commit -a")

The file savings.py is now in a state of conflict:

# Calculate retirement savings
import sys

def savings_calculator(amount, interest, years=1, stop_deposit=None):
    final_amount = 0
    if stop_deposit is None:
        stop_deposit = years
        
    for year in range(years):
<<<<<<< HEAD
        final_amount = round((amount + final_amount)*(1 + interest), 2)
    return final_amount

if __name__ == "__main__":
    if len(sys.argv) == 3:
        amount = int(sys.argv[1])
        interest = float(sys.argv[2])
        final = savings_calculator(amount, interest)
        years = 1
    elif len(sys.argv) == 4:
        amount = int(sys.argv[1])
        interest = float(sys.argv[2])
        years = int(sys.argv[3])
        final = savings_calculator(amount, interest, years)
    else:
        print(f"Usage: {sys.argv[0]} amount interest [years]")
        exit()

    print(f"Savings after {years} years: {final}")

        
=======
        if year < stop_deposit:
            final_amount = (amount + final_amount)*(1 + interest)
        else:
            final_amount = final_amount*(1 + interest)
    return final_amount
>>>>>>> varying-deposits

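Fix the merge: edit the file so that it keeps the changes from both branches and contains no conflict markers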

%%file savings.py
# Calculate retirement savings
import sys

def savings_calculator(amount, interest, years=1, stop_deposit=None):
    """Return the balance after `years` years of saving with compound interest.

    `amount` is deposited at the start of every year whose zero-based index
    is below `stop_deposit` (every year when `stop_deposit` is None).  After
    the optional deposit the whole balance grows by the rate `interest`.
    """
    deposit_years = years if stop_deposit is None else stop_deposit

    balance = 0
    for year in range(years):
        if year < deposit_years:
            # still in the deposit period: add this year's deposit first
            balance = amount + balance
        balance = balance * (1 + interest)
    return balance

if __name__ == "__main__":
    # Command line: amount interest [years]; years defaults to 1 when omitted.
    if len(sys.argv) not in (3, 4):
        print(f"Usage: {sys.argv[0]} amount interest [years]")
        # sys.exit is the supported way to leave a script; the bare exit()
        # helper is added by the site module for interactive sessions.
        # A non-zero status tells the calling shell that the call was wrong.
        sys.exit(1)

    amount = int(sys.argv[1])
    interest = float(sys.argv[2])
    years = int(sys.argv[3]) if len(sys.argv) == 4 else 1
    final = savings_calculator(amount, interest, years)

    print(f"Savings after {years} years: {final}")

    
Overwriting savings.py
!git status
On branch master
You have unmerged paths.
  (fix conflicts and run "git commit")
  (use "git merge --abort" to abort the merge)

Unmerged paths:
  (use "git add <file>..." to mark resolution)
	both modified:   savings.py

no changes added to commit (use "git add" and/or "git commit -a")
%%file savings.py
# Calculate retirement savings
import sys

def savings_calculator(amount, interest, years=1, stop_deposit=None):
    """Return the balance after `years` years of saving with compound interest.

    `amount` is deposited at the start of every year whose zero-based index
    is below `stop_deposit` (every year when `stop_deposit` is None).  After
    the optional deposit the whole balance grows by the rate `interest`.
    """
    deposit_years = years if stop_deposit is None else stop_deposit

    balance = 0
    for year in range(years):
        if year < deposit_years:
            # still in the deposit period: add this year's deposit first
            balance = amount + balance
        balance = balance * (1 + interest)
    return balance

if __name__ == "__main__":
    # Command line: amount interest [years]; years defaults to 1 when omitted.
    if len(sys.argv) not in (3, 4):
        print(f"Usage: {sys.argv[0]} amount interest [years]")
        # sys.exit is the supported way to leave a script; the bare exit()
        # helper is added by the site module for interactive sessions.
        # A non-zero status tells the calling shell that the call was wrong.
        sys.exit(1)

    amount = int(sys.argv[1])
    interest = float(sys.argv[2])
    years = int(sys.argv[3]) if len(sys.argv) == 4 else 1
    final = savings_calculator(amount, interest, years)

    print(f"Savings after {years} years: {final}")
Overwriting savings.py
!git add -u
!git commit -m "fix conflicts"
[master a71a30b] fix conflicts
!git status
On branch master
nothing to commit, working tree clean

Remote repositories#

!git remote add kth-github git@gits-15.sys.kth.se:BB1000/git_demo.git
!git remote
kth-github
!git remote -v
kth-github	git@gits-15.sys.kth.se:BB1000/git_demo.git (fetch)
kth-github	git@gits-15.sys.kth.se:BB1000/git_demo.git (push)
!git push kth-github master
Enumerating objects: 20, done.
Counting objects: 100% (20/20), done.
Delta compression using up to 12 threads
Compressing objects: 100% (19/19), done.
Writing objects: 100% (20/20), 2.50 KiB | 2.50 MiB/s, done.
Total 20 (delta 3), reused 0 (delta 0), pack-reused 0 (from 0)
remote: Resolving deltas: 100% (3/3), done.
To gits-15.sys.kth.se:BB1000/git_demo.git
 + 56b59e6...a71a30b master -> master (forced update)
Enumerating objects: 23, done.
Counting objects: 100% (23/23), done.
Delta compression using up to 12 threads
Compressing objects: 100% (22/22), done.
Writing objects: 100% (23/23), 2.66 KiB | 680.00 KiB/s, done.
Total 23 (delta 6), reused 0 (delta 0), pack-reused 0 (from 0)
remote: Resolving deltas: 100% (6/6), done.
To gits-15.sys.kth.se:BB1000/git_demo.git
 * [new branch]      master -> master
!git log --oneline --all
a71a30b (HEAD -> master, kth-github/master) fix conflicts
335928a (varying-deposits) Changes for optional deposit years
65a3b2d (script) Include command-line arguments
ea52645 Annual deposits with for loop
0075416 Allow for several years of savings
a324f3f Initial commit
%cd /tmp
!git clone git@gits-15.sys.kth.se:BB1000/git_demo
Cloning into 'git_demo'...
remote: Enumerating objects: 23, done.
remote: Counting objects: 100% (23/23), done.
remote: Compressing objects: 100% (16/16), done.
remote: Total 23 (delta 6), reused 23 (delta 6), pack-reused 0
Receiving objects: 100% (23/23), done.
Resolving deltas: 100% (6/6), done.
%cd git_demo
/tmp/git_demo
!git log --oneline --all
5ee6af4 (HEAD -> master, origin/master, origin/HEAD) fix conflicts
25bbab9 Changes for optional deposit years
a884958 Include command-line arguments
542ba6f Annual deposits with for loop
9eb01ff Allow for several years of savings
700da30 Initial commit
%cd /tmp/git_demo
/tmp/git_demo
# on second computer/ second user
!git log --oneline --all
85ebcbf (HEAD -> master) Define start end stop deposit years
5ee6af4 (origin/master, origin/HEAD) fix conflicts
25bbab9 Changes for optional deposit years
a884958 Include command-line arguments
542ba6f Annual deposits with for loop
9eb01ff Allow for several years of savings
700da30 Initial commit
!git remote -v
origin	git@gits-15.sys.kth.se:BB1000/git_demo (fetch)
origin	git@gits-15.sys.kth.se:BB1000/git_demo (push)
!git push origin master
Enumerating objects: 5, done.
Counting objects: 100% (5/5), done.
Delta compression using up to 12 threads
Compressing objects: 100% (3/3), done.
Writing objects: 100% (3/3), 400 bytes | 400.00 KiB/s, done.
Total 3 (delta 1), reused 0 (delta 0), pack-reused 0 (from 0)
remote: Resolving deltas: 100% (1/1), completed with 1 local object.
To gits-15.sys.kth.se:BB1000/git_demo
   5ee6af4..85ebcbf  master -> master
# back to first computer
%cd ~/bb1000/git_demo
/home/bb1000-vt25/bb1000/git_demo
# update local information
# two possibilities: git pull, or git fetch followed by git merge
!git remote -v
kth-github	git@gits-15.sys.kth.se:BB1000/git_demo.git (fetch)
kth-github	git@gits-15.sys.kth.se:BB1000/git_demo.git (push)
!git log --oneline --all
5ee6af4 (HEAD -> master, kth-github/master) fix conflicts
25bbab9 (varying-deposits) Changes for optional deposit years
a884958 (script) Include command-line arguments
542ba6f Annual deposits with for loop
9eb01ff Allow for several years of savings
700da30 Initial commit
!git fetch --all # update our local information about all remotes
remote: Enumerating objects: 5, done.
remote: Counting objects: 100% (5/5), done.
remote: Compressing objects: 100% (2/2), done.
remote: Total 3 (delta 1), reused 3 (delta 1), pack-reused 0
Unpacking objects: 100% (3/3), 380 bytes | 380.00 KiB/s, done.
From gits-15.sys.kth.se:BB1000/git_demo
   5ee6af4..85ebcbf  master     -> kth-github/master
!git log --oneline --all
85ebcbf (kth-github/master, kth-github/HEAD) Define start end stop deposit years
5ee6af4 (HEAD -> master) fix conflicts
25bbab9 (varying-deposits) Changes for optional deposit years
a884958 (script) Include command-line arguments
542ba6f Annual deposits with for loop
9eb01ff Allow for several years of savings
700da30 Initial commit
!git merge kth-github/master
Updating 5ee6af4..85ebcbf
Fast-forward
 savings.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)
!git log --oneline --all
85ebcbf (HEAD -> master, kth-github/master, kth-github/HEAD) Define start end stop deposit years
5ee6af4 fix conflicts
25bbab9 (varying-deposits) Changes for optional deposit years
a884958 (script) Include command-line arguments
542ba6f Annual deposits with for loop
9eb01ff Allow for several years of savings
700da30 Initial commit
# include new updates with git pull
!git pull
remote: Enumerating objects: 5, done.
remote: Counting objects: 100% (5/5), done.
remote: Compressing objects: 100% (3/3), done.
remote: Total 3 (delta 1), reused 0 (delta 0), pack-reused 0
Unpacking objects: 100% (3/3), 395 bytes | 395.00 KiB/s, done.
From gits-15.sys.kth.se:BB1000/git_demo
   85ebcbf..cbce447  master     -> kth-github/master
There is no tracking information for the current branch.
Please specify which branch you want to merge with.
See git-pull(1) for details.

    git pull <remote> <branch>

If you wish to set tracking information for this branch you can do so with:

    git branch --set-upstream-to=kth-github/<branch> master
!git pull kth-github master
From gits-15.sys.kth.se:BB1000/git_demo
 * branch            master     -> FETCH_HEAD
Updating 85ebcbf..cbce447
Fast-forward
 savings.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
!git log --oneline --all
cbce447 (HEAD -> master, kth-github/master, kth-github/HEAD) Update savings.py: round output to two decimals
85ebcbf Define start end stop deposit years
5ee6af4 fix conflicts
25bbab9 (varying-deposits) Changes for optional deposit years
a884958 (script) Include command-line arguments
542ba6f Annual deposits with for loop
9eb01ff Allow for several years of savings
700da30 Initial commit

update: plot savings history#

# %load git_demo/savings.py
# Calculate retirement savings
import sys
import matplotlib.pyplot as plt

def savings_calculator(amount, interest, years=1, start_deposit=None, stop_deposit=None, plot=False):
    """Return the savings balance after `years` years of compound interest.

    A deposit of `amount` is made at the start of every year whose zero-based
    index satisfies start_deposit <= year < stop_deposit.  `start_deposit`
    defaults to 0 and `stop_deposit` to `years`, i.e. a deposit every year.
    Each year the balance grows by the rate `interest` and is then rounded to
    two decimals.  With `plot=True` the year-by-year balances are added to
    the current matplotlib figure.
    """
    first_deposit_year = 0 if start_deposit is None else start_deposit
    end_deposit_year = years if stop_deposit is None else stop_deposit

    balance = 0
    yearly_balances = []
    for year in range(years):
        if first_deposit_year <= year < end_deposit_year:
            # inside the deposit window: pay in before interest is applied
            balance = amount + balance
        balance = round(balance * (1 + interest), 2)
        yearly_balances.append(balance)

    if plot:
        plt.plot(yearly_balances)

    return balance

Compare early career savings vs late career savings (10000 per year)

final = savings_calculator(10000, .1, years=40, start_deposit=0, stop_deposit=10, plot=True); print("Total deposits", 10000*10, ", final savings", final)
final = savings_calculator(10000, .1, years=40, start_deposit=10, stop_deposit=40, plot=True); print("Total deposits", 10000*30, ", final savings", final)

plt.show()
Total deposits 100000 , final savings 3059084.15
Total deposits 300000 , final savings 1809434.33
_images/cb2ee41e9d23172c900c73f64d2d5a83c1aa8bb209cd057f2d528251d71b608f.png

Conclusion: the late saver, who deposits three times as much money, never catches up with the early saver, because of compound interest accumulating over the additional years.

Advanced topics#

Decorators#

# function with arbitrary arguments
#example
print('Hello')
print('Hello', 'world')
Hello
Hello world
def f(*args):
    """Print the tuple that collects all positional arguments of the call."""
    print(args)
f()
()
f('Hello')
('Hello',)
f('Hello', 'world')
('Hello', 'world')
def g(*args, **kwargs):
    """Print the positional and the keyword arguments of the call."""
    print('In function g:')
    print(args)  # tuple of the positional arguments
    print(kwargs)  # dict of the keyword arguments
g()
In function g:
()
{}
g('Hello', who='World')
In function g:
('Hello',)
{'who': 'World'}
def h(*args, **kwargs):
    """Print the received arguments, then forward all of them to g."""
    print('in function h:', args, kwargs)
    g(*args, **kwargs)  # unpack again so that g sees the same call as h did
h('Hello', who='World')
in function h: ('Hello',) {'who': 'World'}
In function g:
('Hello',)
{'who': 'World'}
# example timing
import math
import time

def slow_math(x):
    """Return the square root of x after a one-second pause."""
    time.sleep(1)  # artificial delay so that there is something to time
    return math.sqrt(x)
    
t1 = time.time()
slow_math(2)
t2 = time.time()
print("Time used in slow_math", round(t2-t1, 1))
Time used in slow_math 1.0
t1 = time.time()
slow_math(3)
t2 = time.time()
print("Time used in slow_math", round(t2-t1, 1))
Time used in slow_math 1.0
# with a decorator - function that takes a function as input, and returns a function

def timeme(function):
    """Return a wrapper around `function` that reports each call's duration.

    The wrapper forwards all positional and keyword arguments, prints the
    elapsed wall-clock time rounded to one decimal, and hands back whatever
    `function` returned.
    """
    def timed_function(*args, **kwargs):
        started = time.time()
        result = function(*args, **kwargs)
        elapsed = time.time() - started
        print("Time used", round(elapsed, 1))
        return result
    return timed_function
timeme(slow_math)
<function __main__.timeme.<locals>.timed_function(*args, **kwargs)>
slow_math_timed = timeme(slow_math)
slow_math_timed(4)
Time used 1.0
2.0
# decorator notation

@timeme  # equivalent to slow_math=timeme(slow_math), redefinition
def slow_math(x):
    time.sleep(1)
    return math.sqrt(x)
slow_math(2)
Time used 1.0
1.4142135623730951
def debug(function):
    """Return a wrapper around `function` that logs every call.

    Before the call the wrapper prints the function's name and arguments,
    afterwards the name and the return value, which it then passes on.
    """
    def wrapped(*args, **kwargs):
        name = function.__name__
        print(name, 'called with', args, kwargs)
        result = function(*args, **kwargs)
        print(name, 'returns', result)
        return result
    return wrapped
@debug  # equivalent to slow_math = debug(slow_math)
def slow_math(x):
    time.sleep(1)
    return math.sqrt(x)
slow_math(2)
slow_math called with (2,) {}
slow_math returns 1.4142135623730951
1.4142135623730951
# combination of decorators
@timeme  # outermost: times the whole call, including debug's printing
@debug  # innermost: applied first, wraps slow_math directly
def slow_math(x):
    time.sleep(1)
    return math.sqrt(x)
slow_math(2)
slow_math called with (2,) {}
slow_math returns 1.4142135623730951
Time used 1.0
1.4142135623730951
# combination of decorators
@debug  # outermost: wraps the function returned by timeme, so it reports the name 'timed_function'
@timeme  # innermost: applied first, wraps slow_math directly
def slow_math(x):
    time.sleep(1)
    return math.sqrt(x)
slow_math(2)
timed_function called with (2,) {}
Time used 1.0
timed_function returns 1.4142135623730951
1.4142135623730951
# combination of decorators with wraps
from functools import wraps

def timeme(function):
    """Return a wrapper around `function` that reports each call's duration.

    The wrapper keeps the name and docstring of `function` (via wraps),
    forwards all arguments, prints the function name together with the
    elapsed wall-clock time rounded to one decimal, and passes on the
    return value.
    """
    @wraps(function)
    def timed_function(*args, **kwargs):
        started = time.time()
        result = function(*args, **kwargs)
        elapsed = time.time() - started
        print("Time used in", function.__name__, round(elapsed, 1))
        return result
    return timed_function

def debug(function):
    """Return a wrapper around `function` that logs every call.

    The wrapper keeps the name and docstring of `function` (via wraps).
    It prints the function's name and arguments before the call and the
    name and return value afterwards, then passes the return value on.
    """
    @wraps(function)
    def wrapped(*args, **kwargs):
        name = function.__name__
        print(name, 'called with', args, kwargs)
        result = function(*args, **kwargs)
        print(name, 'returns', result)
        return result
    return wrapped
# combination of decorators with wraps

@timeme  # outermost: receives debug's wrapper, which wraps() has renamed to 'slow_math'
@debug  # innermost: applied first, wraps slow_math directly
def slow_math(x):
    time.sleep(1)
    return math.sqrt(x)

slow_math(2)
slow_math called with (2,) {}
slow_math returns 1.4142135623730951
Time used in slow_math 1.0
1.4142135623730951
## Context managers

#t1 = time.time()
#slow_math(2)
#t2 = time.time()
#print("Time used in slow_math", round(t2-t1, 1))
#example: opening files
with open('sample.txt', 'w') as f:
    f.write('hello')
class TimeMe:
    """Context manager that reports the wall-clock time spent in its block.

    Attributes set while in use:
        t1 -- time.time() reading taken when the block is entered
        t2 -- time.time() reading taken when the block is left
    """

    def __enter__(self):
        self.t1 = time.time()
        # Hand the instance to `with TimeMe() as t:` so that t.t1 and t.t2
        # can be inspected afterwards; without this, t would be None.
        return self

    def __exit__(self, *args):
        # *args receives the exception type, value and traceback (all None
        # when the block finished normally).  Returning None lets any
        # exception propagate after the timing line has been printed.
        self.t2 = time.time()
        print("Time used in block", round(self.t2 - self.t1, 1))
        
with TimeMe():
    time.sleep(2)
Time used in block 2.0

Iterators#

li = ['hello', 'there', 'world']
# primitive iteration
for index in range(len(li)):
    print(index)
    print(li[index])
    
0
hello
1
there
2
world
# pythonic iteration
for member in li:
    print(member)
hello
there
world
for member in enumerate(li):
    print(member)
(0, 'hello')
(1, 'there')
(2, 'world')
for member in enumerate(li, start=1):
    print(member)
(1, 'hello')
(2, 'there')
(3, 'world')
#dictionaries
dictionary = {'a':1, 'b':2}
for k in dictionary:
    print(k, dictionary[k])
a 1
b 2
for k in dictionary.items():
    print(k)
('a', 1)
('b', 2)
# internally in the for loop
dir(dictionary)
['__class__',
 '__class_getitem__',
 '__contains__',
 '__delattr__',
 '__delitem__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__ior__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__ne__',
 '__new__',
 '__or__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__reversed__',
 '__ror__',
 '__setattr__',
 '__setitem__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 'clear',
 'copy',
 'fromkeys',
 'get',
 'items',
 'keys',
 'pop',
 'popitem',
 'setdefault',
 'update',
 'values']
dictionary.__iter__()
<dict_keyiterator at 0x7a37888ef650>
dir(dictionary.__iter__())
['__class__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__length_hint__',
 '__lt__',
 '__ne__',
 '__new__',
 '__next__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__']
dictionary.__iter__().__next__() # get the first value in the sequence
'a'
dictionary_iterator = dictionary.__iter__()  # same as iter(dictionary)
next(dictionary_iterator)
'a'
next(dictionary_iterator)
'b'
next(dictionary_iterator)
---------------------------------------------------------------------------
StopIteration                             Traceback (most recent call last)
Cell In[168], line 1
----> 1 next(dictionary_iterator)

StopIteration: 
# internally what happens during
for k in dictionary:
    print(k)
a
b

Generators#

def myfunction(n):
    """Return 0 whatever n is: an ordinary function returns once and is done."""
    i = 0
    return i
myfunction(3)
0
def myrange(n):
    """Generate 0, 1, 2, ... for as long as the value is smaller than `n`.

    Values are produced lazily: execution pauses at each yield and resumes
    from there when the next value is requested.
    """
    current = 0
    while True:
        if not current < n:
            return  # exhausted: the caller sees StopIteration
        yield current
        current += 1
myrange(3)
<generator object myrange at 0x7a37888f9cc0>
for element in myrange(3):
    print(element)
0
1
2
# internally
myrange_iterator = myrange(3)
next(myrange_iterator)
0
next(myrange_iterator)
1
next(myrange_iterator)
2
next(myrange_iterator)
---------------------------------------------------------------------------
StopIteration                             Traceback (most recent call last)
Cell In[190], line 1
----> 1 next(myrange_iterator)

StopIteration: 
#Example: list the first n Fibonacci numbers:  0 1 1 2 3 5 8...

def fib(n):
    """Generate the first `n` Fibonacci numbers, starting 0, 1, 1, 2, 3, ..."""
    current, following = 0, 1
    produced = 0
    while produced < n:
        yield current
        # slide the pair one step along the sequence
        current, following = following, current + following
        produced += 1
    
    
for fibnum in fib(5):
    print(fibnum)
0
1
1
2
3
list(fib(20))
[0,
 1,
 1,
 2,
 3,
 5,
 8,
 13,
 21,
 34,
 55,
 89,
 144,
 233,
 377,
 610,
 987,
 1597,
 2584,
 4181]
# Example: search lines in a file for something
%%file data.txt
hello c++
hello fortran
hello python
Writing data.txt
!grep python data.txt
hello python
def grep(search_string, filename):
    """Yield every line of the file `filename` that contains `search_string`.

    Matching lines are produced lazily, one at a time, with newline
    characters stripped from both ends.
    """
    with open(filename) as lines:
        for text in lines:
            if search_string not in text:
                continue
            yield text.strip('\n')
for matching_line in grep('hello', 'data.txt'):
    print(matching_line)
hello c++
hello fortran
hello python
# Example: tail
!tail -f data.txt
hello c++
hello fortran
hello python
hello lisp
^C
def tail(filename):  # generator: yield the file's lines, then keep waiting for new ones (compare tail -f)
    with open(filename) as f:  # opened once; readline() continues from the current position
        while True:  # endless: ends only when iteration is abandoned or interrupted
            line = f.readline()
            if not line:
                time.sleep(1)
                continue
            yield line
for line in tail('data.txt'):
    print(line)
hello c++

hello fortran

hello python

hello lisp

hello rust
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
Cell In[207], line 1
----> 1 for line in tail('data.txt'):
      2     print(line)

Cell In[206], line 6, in tail(filename)
      4 line = f.readline()
      5 if not line:
----> 6     time.sleep(1)
      7     continue
      8 yield line

KeyboardInterrupt: 
# a more general way: grep takes any iterable of lines (e.g. an open file) instead of a filename
f = open('data.txt')

def grep(search_string, f):
    for line in f:
       if search_string in line:
             yield line.strip('\n')
for matching_line in grep('hello', f):
    print(matching_line)
hello c++
hello fortran
hello python
hello lisp
hello rust
def tail(f):  # generator over an already open file object: yield its lines, then keep waiting for new ones
    while True:  # endless: ends only when iteration is abandoned or interrupted
        line = f.readline()
        if not line:
            time.sleep(1)
            continue
        yield line
f = open('data.txt')
for line in tail(f):
    print(line)
hello c++

hello fortran

hello python

hello lisp

hello rust
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
Cell In[212], line 2
      1 f = open('data.txt')
----> 2 for line in tail(f):
      3     print(line)

Cell In[210], line 5, in tail(f)
      3 line = f.readline()
      4 if not line:
----> 5     time.sleep(1)
      6     continue
      7 yield line

KeyboardInterrupt: 
for line in grep('python', tail(open('data.txt'))):
    print(line)
hello python
python for all
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
Cell In[215], line 1
----> 1 for line in grep('python', tail(open('data.txt'))):
      2     print(line)

Cell In[208], line 5, in grep(search_string, f)
      4 def grep(search_string, f):
----> 5     for line in f:
      6        if search_string in line:
      7              yield line.strip('\n')

Cell In[210], line 5, in tail(f)
      3 line = f.readline()
      4 if not line:
----> 5     time.sleep(1)
      6     continue
      7 yield line

KeyboardInterrupt: 

Exceptions#

1/0
---------------------------------------------------------------------------
ZeroDivisionError                         Traceback (most recent call last)
Cell In[2], line 1
----> 1 1/0

ZeroDivisionError: division by zero
try:
    1 / 0
except ZeroDivisionError:
    print("Warning: tried division by zero")

print("Continuing..")
Warning: tried division by zero
Continuing..