Notes

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

Notes#

Lecture notes for BB1000 (20034) vt25

Basics#

Numeric data types#

8 * 9 # expression: values and operators

type(8 * 9)

int

type(8 / 9)

float

8 / 9 

0.8888888888888888

8 // 9 # floor division

11 // 9

11 % 9  # modulus, remainder

Complex number

\[ z = x + i*y \]

x: real part, y: imaginary part

\[ i^2 = -1 \]

type(1j)

complex

1j * 1j

(-1+0j)

1j ** 2

(-1+0j)

True

True

False

False

type(True)

bool

True or False

True

True and False

False

123 % 2 == 0 # True for even numbers

False

124 % 2 == 0 # True for even numbers

True

1 + 2 == 3

True

0.1 + 0.2 == 0.3

False

0.1 + 0.2

0.30000000000000004

Strings#

'Hello world'

'Hello world'

type('Hello world')

str

"Hello world"

'Hello world'

'Hello' == "Hello"

True

'It's time to go'

  Cell In[27], line 1
    'It's time to go'
                    ^
SyntaxError: unterminated string literal (detected at line 1)

'It\'s time to go'  # here \' means the literal apostrophe

"It's time to go"

"It's time to go"

"It's time to go"

"It's time\nto go"  # \n refers to newline character

"It's time\nto go"

print("It's time\nto go")

It's time
to go

"It's time
to go"

  Cell In[32], line 1
    "It's time
    ^
SyntaxError: unterminated string literal (detected at line 1)

"""It's time
to go"""

"It's time\nto go"

# string operators
"Hello" + " world"

'Hello world'

"Hello" * " world"

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
Cell In[35], line 1
----> 1 "Hello" * " world"

TypeError: can't multiply sequence by non-int of type 'str'

"Hello" * 3

'HelloHelloHello'

print("Hello world" + "\n" +  len("Hello world")*'_')

Hello world
___________

# datatypes are also functions that convert values from one type to another
type(int('77'))

int

# ask year of birth and report age
print("What year were you born?")
year_of_birth = input()
print(year_of_birth)
age = 2025 - int(year_of_birth)
print("This year you will be " + str(age) + " years old")
print("This year you will be", age, "years old")
print(f"This year you will be {age} years old")

What year were you born?

1961
This year you will be 64 years old
This year you will be 64 years old
This year you will be 64 years old

Lists#

colours = ['hearts', 'spades', 'diamonds', 'clubs']
values = [2, 3, 4, 5, 6, 7, 8, 9, 10, 'knight', 'queen', 'king', 'ace']

dir(colours)

['__add__',
 '__class__',
 '__class_getitem__',
 '__contains__',
 '__delattr__',
 '__delitem__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__iadd__',
 '__imul__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__mul__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__reversed__',
 '__rmul__',
 '__setattr__',
 '__setitem__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 'append',
 'clear',
 'copy',
 'count',
 'extend',
 'index',
 'insert',
 'pop',
 'remove',
 'reverse',
 'sort']

colours + colours

['hearts',
 'spades',
 'diamonds',
 'clubs',
 'hearts',
 'spades',
 'diamonds',
 'clubs']

len(colours)

len(values)

colours[0] # the first element

'hearts'

colours[3] 

'clubs'

colours[-1]

'clubs'

values.append('joker')

values

[2, 3, 4, 5, 6, 7, 8, 9, 10, 'knight', 'queen', 'king', 'ace', 'joker']

values.pop?

Signature: values.pop(index=-1, /)
Docstring:
Remove and return item at index (default last).

Raises IndexError if list is empty or index is out of range.
Type:      builtin_function_or_method

values.pop()

'joker'

values

[2, 3, 4, 5, 6, 7, 8, 9, 10, 'knight', 'queen', 'king', 'ace']

colours

['hearts', 'spades', 'diamonds', 'clubs']

colours.sort()

colours

['clubs', 'diamonds', 'hearts', 'spades']

colours.reverse()

colours

['spades', 'hearts', 'diamonds', 'clubs']

Tuples#

card1 = (colours[0], values[0])

card1

('spades', 2)

dir(card1)

['__add__',
 '__class__',
 '__class_getitem__',
 '__contains__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__getnewargs__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__mul__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__rmul__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 'count',
 'index']

card1.index(2)

Slicing#

values

[2, 3, 4, 5, 6, 7, 8, 9, 10, 'knight', 'queen', 'king', 'ace']

# list[start: end: step]
values[1: 5]

[3, 4, 5, 6]

values[0: 13: 2] # every second element

[2, 4, 6, 8, 10, 'queen', 'ace']

values[::2]

[2, 4, 6, 8, 10, 'queen', 'ace']

# the first 9 elements
values[:9]

[2, 3, 4, 5, 6, 7, 8, 9, 10]

# the last 4 elements
values[-4:]

['knight', 'queen', 'king', 'ace']

# compare
"Hello world"[-4:]

'orld'

Dictionaries#

d = {'a': 1, 'b': 2}

len(d)

d.keys()

dict_keys(['a', 'b'])

d.values()

dict_values([1, 2])

d['a']

d['b']

{'a': 1, 'b': 2} == {'b': 2, 'a': 1}

True

# Card game

# Each player's hand starts out as an empty list, keyed by the player's name.
game = {
    'player1': [],
    'player2': []
}

# A small deck of (colour, value) tuples.
deck = [ ('hearts', 2), ('clubs', 4), ('diamonds', 10), ('spades', 5) ]

# Deal the cards alternately: deck.pop() removes and returns the last card of
# the deck, which is then appended to the player's hand. Both the game and the
# remaining deck are printed after every deal.
game['player1'].append(deck.pop())
print(game)
print(deck)
game['player2'].append(deck.pop())
print(game)
print(deck)
game['player1'].append(deck.pop())
print(game)
print(deck)
game['player2'].append(deck.pop())
print(game)
print(deck)

{'player1': [('spades', 5)], 'player2': []}
[('hearts', 2), ('clubs', 4), ('diamonds', 10)]
{'player1': [('spades', 5)], 'player2': [('diamonds', 10)]}
[('hearts', 2), ('clubs', 4)]
{'player1': [('spades', 5), ('clubs', 4)], 'player2': [('diamonds', 10)]}
[('hearts', 2)]
{'player1': [('spades', 5), ('clubs', 4)], 'player2': [('diamonds', 10), ('hearts', 2)]}
[]

game

{'player1': [('spades', 5), ('clubs', 4)],
 'player2': [('diamonds', 10), ('hearts', 2)]}

Repetition#

for-loops#

colours

['spades', 'hearts', 'diamonds', 'clubs']

# Build a full deck: one (colour, value) card for every combination of
# an entry in colours and an entry in values.
deck = []
for colour in colours:
    print(colour)
    for value in values:
        # end=' ' keeps all values of one colour on the same output line
        print(value, end=' ')
        card = (colour, value)
        deck.append(card)
    # finish the line of values before the next colour is printed
    print()

spades
2 3 4 5 6 7 8 9 10 knight queen king ace 
hearts
2 3 4 5 6 7 8 9 10 knight queen king ace 
diamonds
2 3 4 5 6 7 8 9 10 knight queen king ace 
clubs
2 3 4 5 6 7 8 9 10 knight queen king ace 

len(deck)

#for x in game.keys():
for player in game:
    print(player)
    print(f"Player {player} has hand {game[player]}")

player1
Player player1 has hand [('spades', 5), ('clubs', 4)]
player2
Player player2 has hand [('diamonds', 10), ('hearts', 2)]

for x in game.items():
    print(x)

('player1', [('spades', 5), ('clubs', 4)])
('player2', [('diamonds', 10), ('hearts', 2)])

for x in game.items():
    player = x[0]
    hand = x[1]
    print(f"Player {player} has hand {hand}")

Player player1 has hand [('spades', 5), ('clubs', 4)]
Player player2 has hand [('diamonds', 10), ('hearts', 2)]

for x in game.items():
    player, hand = x
    print(f"Player {player} has hand {hand}")

Player player1 has hand [('spades', 5), ('clubs', 4)]
Player player2 has hand [('diamonds', 10), ('hearts', 2)]

for player, hand in game.items():
    print(f"Player {player} has hand {hand}")

Player player1 has hand [('spades', 5), ('clubs', 4)]
Player player2 has hand [('diamonds', 10), ('hearts', 2)]

Branching#

if statements#

# game where largest sum of value wins

if False:
    print('yes')
else:
    print('no')

no

# sum up card values for player 1
value1 = 0
for card in game['player1']:
    value1 = value1 + card[1]
print(value1)

# sum up card values for player 2
value2 = 0
for card in game['player2']:
    value2 = value2 + card[1]
print(value2)

# The player with the largest hand value wins; equal hand values are a draw
# (a plain if/else would report a tie as a win for player 2).
if value1 > value2:
    print("Player1 wins")
elif value2 > value1:
    print("Player2 wins")
else:
    print("Draw")

Player2 wins

Functions#

print

<function print(*args, sep=' ', end='\n', file=None, flush=False)>

print("Hello")

Hello

len

<function len(obj, /)>

len('abc')

len([1,2,])

len({'a': 1})

input

<bound method Kernel.raw_input of <ipykernel.ipkernel.IPythonKernel object at 0x71a69fe93620>>

input()

'hoho'

#dir()

dir(__builtins__)

['ArithmeticError',
 'AssertionError',
 'AttributeError',
 'BaseException',
 'BaseExceptionGroup',
 'BlockingIOError',
 'BrokenPipeError',
 'BufferError',
 'BytesWarning',
 'ChildProcessError',
 'ConnectionAbortedError',
 'ConnectionError',
 'ConnectionRefusedError',
 'ConnectionResetError',
 'DeprecationWarning',
 'EOFError',
 'Ellipsis',
 'EncodingWarning',
 'EnvironmentError',
 'Exception',
 'ExceptionGroup',
 'False',
 'FileExistsError',
 'FileNotFoundError',
 'FloatingPointError',
 'FutureWarning',
 'GeneratorExit',
 'IOError',
 'ImportError',
 'ImportWarning',
 'IndentationError',
 'IndexError',
 'InterruptedError',
 'IsADirectoryError',
 'KeyError',
 'KeyboardInterrupt',
 'LookupError',
 'MemoryError',
 'ModuleNotFoundError',
 'NameError',
 'None',
 'NotADirectoryError',
 'NotImplemented',
 'NotImplementedError',
 'OSError',
 'OverflowError',
 'PendingDeprecationWarning',
 'PermissionError',
 'ProcessLookupError',
 'PythonFinalizationError',
 'RecursionError',
 'ReferenceError',
 'ResourceWarning',
 'RuntimeError',
 'RuntimeWarning',
 'StopAsyncIteration',
 'StopIteration',
 'SyntaxError',
 'SyntaxWarning',
 'SystemError',
 'SystemExit',
 'TabError',
 'TimeoutError',
 'True',
 'TypeError',
 'UnboundLocalError',
 'UnicodeDecodeError',
 'UnicodeEncodeError',
 'UnicodeError',
 'UnicodeTranslateError',
 'UnicodeWarning',
 'UserWarning',
 'ValueError',
 'Warning',
 'ZeroDivisionError',
 '_IncompleteInputError',
 '__IPYTHON__',
 '__build_class__',
 '__debug__',
 '__doc__',
 '__import__',
 '__loader__',
 '__name__',
 '__package__',
 '__spec__',
 'abs',
 'aiter',
 'all',
 'anext',
 'any',
 'ascii',
 'bin',
 'bool',
 'breakpoint',
 'bytearray',
 'bytes',
 'callable',
 'chr',
 'classmethod',
 'compile',
 'complex',
 'copyright',
 'credits',
 'delattr',
 'dict',
 'dir',
 'display',
 'divmod',
 'enumerate',
 'eval',
 'exec',
 'execfile',
 'filter',
 'float',
 'format',
 'frozenset',
 'get_ipython',
 'getattr',
 'globals',
 'hasattr',
 'hash',
 'help',
 'hex',
 'id',
 'input',
 'int',
 'isinstance',
 'issubclass',
 'iter',
 'len',
 'license',
 'list',
 'locals',
 'map',
 'max',
 'memoryview',
 'min',
 'next',
 'object',
 'oct',
 'open',
 'ord',
 'pow',
 'print',
 'property',
 'range',
 'repr',
 'reversed',
 'round',
 'runfile',
 'set',
 'setattr',
 'slice',
 'sorted',
 'staticmethod',
 'str',
 'sum',
 'super',
 'tuple',
 'type',
 'vars',
 'zip']

# define our own function, make this a function
"""
value1 = 0
for card in game['player1']:
    value1 = value1 + card[1]
print(value1)
"""

"\nvalue1 = 0\nfor card in game['player1']:\n    value1 = value1 + card[1]\nprint(value1)\n"

A function definition

starts with the def keyword, a name, parentheses, and a colon
followed by the function body (indented)
values are passed to the function as arguments
parameters of a function are variables that hold these values
variables defined inside the function are not valid outside the function

The lines in a function body are executed when the function is called

the value of a function call is defined by a return statement in the function body
the None object represents the absence of a value; it is returned when the function has no return statement

global scope: variables not defined inside any function; local scope: variables defined inside a function (only valid locally)

a function can use values from the global scope

#define the function
def calculate_hand_value(card_game, player):
    """Print and return the summed card values of one player's hand.

    card_game is indexed with player to obtain that player's list of cards;
    the value of each card is read from index 1 of the card.
    """
    # value is local to this function; it does not touch any global `value`.
    value = sum(card[1] for card in card_game[player])
    print(value)
    return value

# call function
calculate_hand_value(game, 'player1') #value of game is passed to function parameter card_game, 'player1' is passed to parameter player
calculate_hand_value(game, 'player2')

9
12

result = calculate_hand_value(game, 'player1')
result == 9

True

print(result)

print(value)

ace

# alternative with global game
def calculate_hand_value2(player):
    """Print and return the summed card values of player's hand.

    The hand is looked up in the global variable game instead of being
    passed in as an argument.
    """
    hand = game[player]
    value = sum(card[1] for card in hand)
    print(value)
    return value

calculate_hand_value2('player1')

print?

Signature: print(*args, sep=' ', end='\n', file=None, flush=False)
Docstring:
Prints the values to a stream, or to sys.stdout by default.

sep
  string inserted between values, default a space.
end
  string appended after the last value, default a newline.
file
  a file-like object (stream); defaults to the current sys.stdout.
flush
  whether to forcibly flush the stream.
Type:      builtin_function_or_method

print('Hello', 'world', end='')  # args parameter will be the tuple ('Hello', 'world')
print('Hello', 'world', sep='-')

Hello worldHello-world

# previous example with keyword
# alternative with global game as an optional argument
def calculate_hand_value3(player, card_game=game):
    """
    Calculate the value of the hand of player
    """
    # The default for card_game is whatever object the global name `game`
    # refers to at the moment this def statement is executed.
    # Each card's value sits at index 1; the underscore-prefixed name exists
    # only while the function is running.
    _value = sum(_card[1] for _card in card_game[player])
    return _value

calculate_hand_value3?

Signature:
calculate_hand_value3(
    player,
    card_game={'player1': [('spades', 5), ('clubs', 4)], 'player2': [('diamonds', 10), ('hearts', 2)]},
)
Docstring: Calculate the value of the hand of player
File:      /tmp/ipykernel_1077854/3892734418.py
Type:      function

help(calculate_hand_value3)

Help on function calculate_hand_value3 in module __main__:

calculate_hand_value3(
    player,
    card_game={'player1': [('spades', 5), ('clubs', 4)], 'player2': [('diamonds', 10), ('hearts', 2)]}
)
    Calculate the value of the hand of player

calculate_hand_value3('player1')

_value # only defined locally inside the function, not globally

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[120], line 1
----> 1 _value # only defined locally inside the function, not globally

NameError: name '_value' is not defined

Modules#

import hello

The value of __name__ is hello

dir(hello)

['__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__spec__',
 'hello_world',
 'sys']

hello.hello_world()

Hello -f
Goodbye

importing a Python file runs all the code in it
definitions in the file are saved in a so-called namespace

# %load hello.py
import sys

def hello_world():
    """Print "Hello", followed by sys.argv[1] when it exists, then "Goodbye"."""
    # The slice holds sys.argv[1] if present and is empty otherwise, so the
    # same print call covers both cases.
    extra = sys.argv[1:2]
    print("Hello", *extra)
    print("Goodbye")


if __name__ == "__main__": #do not run this code during import
     hello_world()
    
print("The value of __name__ is", __name__)

#import os
#dir(os)

import math
dir(math)

['__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__spec__',
 'acos',
 'acosh',
 'asin',
 'asinh',
 'atan',
 'atan2',
 'atanh',
 'cbrt',
 'ceil',
 'comb',
 'copysign',
 'cos',
 'cosh',
 'degrees',
 'dist',
 'e',
 'erf',
 'erfc',
 'exp',
 'exp2',
 'expm1',
 'fabs',
 'factorial',
 'floor',
 'fma',
 'fmod',
 'frexp',
 'fsum',
 'gamma',
 'gcd',
 'hypot',
 'inf',
 'isclose',
 'isfinite',
 'isinf',
 'isnan',
 'isqrt',
 'lcm',
 'ldexp',
 'lgamma',
 'log',
 'log10',
 'log1p',
 'log2',
 'modf',
 'nan',
 'nextafter',
 'perm',
 'pi',
 'pow',
 'prod',
 'radians',
 'remainder',
 'sin',
 'sinh',
 'sqrt',
 'sumprod',
 'tan',
 'tanh',
 'tau',
 'trunc',
 'ulp']

math.pi

3.141592653589793

math.sin(math.pi)

1.2246467991473532e-16

math.sin(math.pi/2)

1.0

#optional import
from math import pi, sin, cos

sin(pi/4)

0.7071067811865475

cos(pi/4)

0.7071067811865476

Files#

f = open('hello.txt', mode='r') # open for reading an existing file

#dir(f)
f.read()

'Hello\nGoodbye\n'

f.read()

''

f = open('hello.txt', mode='r') # open for reading an existing file
f.readable()
f.readline() # read one line at a time

'Hello\n'

f.readline()

'Goodbye\n'

f.readline()

''

f = open('hello.txt', mode='r') # open for reading an existing file

f.readlines() # read the file into a list of lines

['Hello\n', 'Goodbye\n']

f = open('hello.txt', mode='r') # open for reading an existing file
for line in f:
    #print(line.upper(), end='')
    print(line.strip().upper())

HELLO
GOODBYE

new = open('newhello.txt', mode='w')

new.write("Hello")
new.write("World!")

!ls -l

total 3600
-rw-rw-r-- 1 bb1000-vt25 bb1000-vt25   15308 maj 16 14:23 advanced-2025-05-16.ipynb
-rw-rw-r-- 1 bb1000-vt25 bb1000-vt25  256100 apr 15 08:59 child_mortality_0_5_year_olds_dying_per_1000_born.csv
-rw-r--r-- 1 bb1000-vt25 bb1000-vt25   21434 apr 25 14:03 classes-2025-04-25.ipynb
-rw-r--r-- 1 bb1000-vt25 bb1000-vt25    5000 apr  8 15:58 data.txt
-rw-rw-r-- 1 bb1000-vt25 bb1000-vt25    1816 mar 31 16:00 Demo.ipynb
-rw-rw-r-- 1 bb1000-vt25 bb1000-vt25  298018 apr 15 08:58 gdp_pcap.csv
drwxrwxr-x 4 bb1000-vt25 bb1000-vt25    4096 maj  9 14:40 git-2025-05-09
drwxrwxr-x 6 bb1000-vt25 bb1000-vt25    4096 maj 19 16:26 git_demo
-rw-rw-r-- 1 bb1000-vt25 bb1000-vt25     280 apr  1 08:49 hello.py
-rw-rw-r-- 1 bb1000-vt25 bb1000-vt25      14 mar 31 16:46 hello.txt
drwxrwxr-x 3 bb1000-vt25 bb1000-vt25    4096 apr 28 16:43 htmlcov
-rw-rw-r-- 1 bb1000-vt25 bb1000-vt25     241 apr 28 16:52 leap.py
-rw-rw-r-- 1 bb1000-vt25 bb1000-vt25     149 mar 31 16:30 mult_table.py
-rw-rw-r-- 1 bb1000-vt25 bb1000-vt25      35 apr 29 09:32 my_math.py
-rw-rw-r-- 1 bb1000-vt25 bb1000-vt25       0 maj 20 10:21 newhello.txt
-rw-rw-r-- 1 bb1000-vt25 bb1000-vt25 1332300 maj 20 10:20 Notes.ipynb
-rw-rw-r-- 1 bb1000-vt25 bb1000-vt25    2750 apr 14 16:42 numbers.txt
-rw-r--r-- 1 bb1000-vt25 bb1000-vt25   41855 apr 11 13:55 numpy-2025-04-11.ipynb
-rw-rw-r-- 1 bb1000-vt25 bb1000-vt25   21699 apr 11 13:24 numpy-2025-04-11.zip
-rw-rw-r-- 1 bb1000-vt25 bb1000-vt25      18 apr 14 16:21 omdb.py
-rw-r--r-- 1 bb1000-vt25 bb1000-vt25  425009 apr 14 16:34 pandas-2025-04-11.ipynb
-rw-rw-r-- 1 bb1000-vt25 bb1000-vt25  306031 apr 14 15:20 pandas-2025-04-11.zip
-rw-rw-r-- 1 bb1000-vt25 bb1000-vt25     116 apr  8 09:31 plotdemo.py
-rw-rw-r-- 1 bb1000-vt25 bb1000-vt25  335155 apr 15 09:01 pop.csv
drwxrwxr-x 2 bb1000-vt25 bb1000-vt25    4096 apr 29 10:42 __pycache__
-rw-rw-r-- 1 bb1000-vt25 bb1000-vt25  493778 apr 15 10:21 rosling.png
-rw-rw-r-- 1 bb1000-vt25 bb1000-vt25   18244 apr 29 10:53 testing-2025-04-29.ipynb
-rw-rw-r-- 1 bb1000-vt25 bb1000-vt25     323 apr 28 16:34 test_leap.py
-rw-rw-r-- 1 bb1000-vt25 bb1000-vt25     169 apr 29 09:32 test_my_math.py
-rw-rw-r-- 1 bb1000-vt25 bb1000-vt25     323 apr 29 10:45 test_timestamps.py
-rw-rw-r-- 1 bb1000-vt25 bb1000-vt25     316 apr 29 10:26 timestamps1.py
-rw-rw-r-- 1 bb1000-vt25 bb1000-vt25     292 apr 29 10:38 timestamps2.py
-rw-rw-r-- 1 bb1000-vt25 bb1000-vt25     640 apr 29 09:55 timestamps.py
-rw-r--r-- 1 bb1000-vt25 bb1000-vt25    6742 apr 14 14:49 TSLA.csv
-rw-rw-r-- 1 bb1000-vt25 bb1000-vt25     113 apr 29 09:37 untitled.md

new.close()

!ls -l newhello.txt

-rw-rw-r-- 1 bb1000-vt25 bb1000-vt25 11 maj 20 10:21 newhello.txt

!cat newhello.txt

HelloWorld!

# use write method to get separate lines

with open('newhello.txt', 'w') as new: 
    new.write("Hello\n")
    new.write("World!\n")

!cat newhello.txt

Hello
World!

# add text to end of a file
with open("newhello.txt", 'a') as f:
    f.write("Goodbye")

!cat newhello.txt

Hello
World!
Goodbye

pathlib module#

import pathlib

ls /home/bb1000-vt25/Downloads/MOCK_DATA.csv

/home/bb1000-vt25/Downloads/MOCK_DATA.csv

# in windows   C:\HOME\BB1000\DOWNLOADS\MOCK_DATA.csv

datafile = pathlib.Path.home() / 'Downloads' / 'MOCK_DATA.csv'
datafile

PosixPath('/home/bb1000-vt25/Downloads/MOCK_DATA.csv')

open(datafile).readline()

'id,first_name,last_name,email,gender,ip_address,employee_id,salary,department\n'

#compare mean salaries for different genders

# Read every line of the data file; the with-statement closes the file again
# as soon as the lines have been read.
with open(datafile) as data_file:
    all_lines = data_file.readlines()

gender_data = {} # {'gender': [salaries..]}
for line in all_lines[1:]:  # all_lines[0] is the header row
    fields = line.split(',') # fields are values of single row/line
    gender = fields[4]        # column 'gender' in the header
    salary = int(fields[7])   # column 'salary' in the header
    if gender in gender_data: # is gender present as a key in the dictionary gender_data?
        gender_data[gender].append(salary)
    else:
        gender_data[gender] = [salary]
    

gender_data.keys()

dict_keys(['Male', 'Genderfluid', 'Female', 'Genderqueer', 'Non-binary', 'Bigender', 'Polygender', 'Agender'])

for gender, salaries in gender_data.items():
    mean = round(sum(salaries)/len(salaries))
    print(gender, mean, len(salaries))

Male 84138 444
Genderfluid 95577 17
Female 86417 448
Genderqueer 86913 24
Non-binary 86858 17
Bigender 87399 20
Polygender 77018 18
Agender 96232 12

csv module#

import csv

#dir(csv)

# Print only the first row (the header) as parsed by csv.reader.
# newline='' lets the csv module handle line endings itself, and the
# with-statement closes the file when the block is left.
with open(datafile, newline='') as csv_file:
    for line in csv.reader(csv_file):
        print(line)
        break

['id', 'first_name', 'last_name', 'email', 'gender', 'ip_address', 'employee_id', 'salary', 'department']

csv.reader?

Docstring:
csv_reader = reader(iterable [, dialect='excel']
                        [optional keyword args])
    for row in csv_reader:
        process(row)

The "iterable" argument can be any object that returns a line
of input for each iteration, such as a file object or a list.  The
optional "dialect" parameter is discussed below.  The function
also accepts optional keyword arguments which override settings
provided by the dialect.

The returned object is an iterator.  Each iteration returns a row
of the CSV file (which can span multiple input lines).
Type:      builtin_function_or_method

# Read all rows into a list of lists; the file is closed once the list is built.
with open(datafile, newline='') as csv_file:
    all_lines = list(csv.reader(csv_file))

all_lines[1:5]

[['1',
  'Lucius',
  'Feehery',
  'lfeehery0@wiley.com',
  'Male',
  '155.143.110.178',
  '1',
  '123759',
  'Legal'],
 ['2',
  'Melina',
  'Jossel',
  'mjossel1@slideshare.net',
  'Genderfluid',
  '238.182.167.147',
  '2',
  '91045',
  'Support'],
 ['3',
  'Davide',
  'Allin',
  'dallin2@com.com',
  'Male',
  '151.26.69.126',
  '3',
  '66863',
  'Accounting'],
 ['4',
  'Angeline',
  'Got',
  'agot3@amazon.com',
  'Female',
  '225.181.149.174',
  '4',
  '45217',
  'Human Resources']]

csv.DictReader?

Init signature:
csv.DictReader(
    f,
    fieldnames=None,
    restkey=None,
    restval=None,
    dialect='excel',
    *args,
    **kwds,
)
Docstring:      <no docstring>
File:           ~/miniconda3/envs/bb1000/lib/python3.13/csv.py
Type:           type
Subclasses:     

for line in c: # line will be a dictionary with header values as keys
    print(line)
    break

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[167], line 1
----> 1 for line in c: # line will be a dictionary with header values as keys
      2     print(line)
      3     break

NameError: name 'c' is not defined

# Same grouping as before, but each row is now a dictionary keyed by the
# header names, so columns are addressed by name instead of by position.
gender_data = {} # {'gender': [salaries..]}
with open(datafile, newline='') as csv_file:  # closed automatically after the loop
    for line in csv.DictReader(csv_file):
        salary = int(line['salary'])
        gender = line['gender']
        if gender in gender_data: # is gender present as a key in the dictionary gender_data?
            gender_data[gender].append(salary)
        else:
            gender_data[gender] = [salary]

for gender, salaries in gender_data.items():
    mean = round(sum(salaries)/len(salaries))
    print(gender, mean, len(salaries))

Male 84138 444
Genderfluid 95577 17
Female 86417 448
Genderqueer 86913 24
Non-binary 86858 17
Bigender 87399 20
Polygender 77018 18
Agender 96232 12

# with defaultdict from the collections module

import collections

collections.defaultdict?

Init signature: collections.defaultdict(self, /, *args, **kwargs)
Docstring:     
defaultdict(default_factory=None, /, [...]) --> dict with default factory

The default factory is called without arguments to produce
a new value when a key is not present, in __getitem__ only.
A defaultdict compares equal to a dict with the same items.
All remaining arguments are treated the same as if they were
passed to the dict constructor, including keyword arguments.
File:           ~/miniconda3/envs/bb1000/lib/python3.13/collections/__init__.py
Type:           type
Subclasses:     FreezableDefaultDict

counter = collections.defaultdict(int)

int()

counter

defaultdict(int, {})

counter['newkey']

counter

defaultdict(int, {'newkey': 0})

# NOTE: the key read on the right-hand side is spelled 'anohter key', not
# 'another key'. Reading a missing key from a defaultdict inserts it with the
# default value, so this line adds two keys (see the output below).
counter['another key'] = counter['anohter key'] + 1

counter

defaultdict(int, {'newkey': 0, 'anohter key': 0, 'another key': 1})

# our previous example, now with a defaultdict
# defaultdict(list) creates an empty list the first time a gender is looked
# up, so the explicit "if gender in gender_data ... else ..." test used
# before is no longer needed.
gender_data = collections.defaultdict(list) # {'gender': [salaries..]}
with open(datafile, newline='') as csv_file:  # closed automatically after the loop
    for line in csv.DictReader(csv_file):
        salary = int(line['salary'])
        gender = line['gender']
        gender_data[gender].append(salary)

for gender, salaries in gender_data.items():
    mean = round(sum(salaries)/len(salaries))
    print(gender, mean, len(salaries))

Male 84138 444
Genderfluid 95577 17
Female 86417 448
Genderqueer 86913 24
Non-binary 86858 17
Bigender 87399 20
Polygender 77018 18
Agender 96232 12

def select_second(t):
    """Return the item at index 1 of t (usable as a sort key function)."""
    second = t[1]
    return second

# print employees by department sorted by last name

# Group (first_name, last_name) tuples by department.
department_personel = collections.defaultdict(list)
with open(datafile, newline='') as csv_file:  # closed automatically after the loop
    for line in csv.DictReader(csv_file):
        dept = line['department']
        name = (line['first_name'], line['last_name'])
        department_personel[dept].append(name)

for dept, names in department_personel.items():
    # select_second picks the last name out of each (first, last) tuple
    names.sort(key=select_second)
    print('Department:', dept)
    for first, last in names:
        print(f'\t{first} {last}')

    # stop after the first department to keep the output short
    break

Department: Legal
	Alejoa Aleksankin
	Hollie Attenborrow
	Benetta Bangley
	Arlena Broxton
	Ferd Burriss
	Jenna Callacher
	Valdemar Canland
	Dov Cleverly
	Quinlan Coslett
	Lucho Coules
	Ronnie Cowthart
	Marieann Daughtrey
	Dacy De la croix
	Tucker Deeming
	Pris Deinhardt
	Terri Dobbson
	Homerus Donwell
	Sile Dunderdale
	Daniele Eloi
	Horatius Etherington
	Lucius Feehery
	Marcile Fitzackerley
	Valencia Galtone
	Hugibert Garrie
	Mozelle Gerant
	Kacey Girodier
	Bobinette Gratten
	Ariela Greenstock
	Freddie Gricks
	Meaghan Guinness
	Rheta Handrik
	Maggi Huller
	Nicholas Hum
	Marti Jodlkowski
	Allianora Kaasman
	Hersh Karpenko
	Marcos Kulic
	Nap Ledur
	Allix Lidgard
	Humfried Lympany
	Maynord Manterfield
	Corella Mattussevich
	Samson Mayman
	Wald McCuthais
	Klement McGeown
	Madison Middler
	Randy Moat
	Dagmar Mote
	Chantal Mumby
	Peadar Murtell
	Thorsten Muscott
	Dione Norwich
	Ive O'Codihie
	Elysha Orrick
	Gusella Peach
	Cordie Peasgood
	Tiphany Philpotts
	Bobby Picard
	Cordie Plaskitt
	Hillie Porteous
	Field Poulsom
	Millisent Praten
	Barris Puckinghorne
	Reggy Raden
	Lacee Rulten
	Tersina Shaefer
	Beverlee Sharper
	Montague Sibbs
	Maxwell Simeoni
	Nita Stanborough
	Welsh Stelle
	Urbanus Sullly
	Alex Szymoni
	Rowen Taffrey
	Alyosha Taggett
	Dyan Talloe
	Justen Tessington
	Francisco Tomaselli
	Urbano Trevear
	Victoir Warsop
	Meris Whalley
	Ricardo Yeaman

# how to sort a list of tuples?

example_list = [('c', 'b'), ('a', 'd'), ('a', 'c')]
example_list.sort()
example_list

[('a', 'c'), ('a', 'd'), ('c', 'b')]

example_list.sort?

Signature: example_list.sort(*, key=None, reverse=False)
Docstring:
Sort the list in ascending order and return None.

The sort is in-place (i.e. the list itself is modified) and stable (i.e. the
order of two equal elements is maintained).

If a key function is given, apply it once to each list item and sort them,
ascending or descending, according to their function values.

The reverse flag can be set to sort in descending order.
Type:      builtin_function_or_method

example_list.sort(key=select_second)
example_list

[('c', 'b'), ('a', 'c'), ('a', 'd')]

csv.DictReader??

Init signature:
csv.DictReader(
    f,
    fieldnames=None,
    restkey=None,
    restval=None,
    dialect='excel',
    *args,
    **kwds,
)
Docstring:      <no docstring>
Source:        
class DictReader:
    def __init__(self, f, fieldnames=None, restkey=None, restval=None,
                 dialect="excel", *args, **kwds):
        if fieldnames is not None and iter(fieldnames) is fieldnames:
            fieldnames = list(fieldnames)
        self._fieldnames = fieldnames   # list of keys for the dict
        self.restkey = restkey          # key to catch long rows
        self.restval = restval          # default value for short rows
        self.reader = reader(f, dialect, *args, **kwds)
        self.dialect = dialect
        self.line_num = 0

    def __iter__(self):
        return self

    @property
    def fieldnames(self):
        if self._fieldnames is None:
            try:
                self._fieldnames = next(self.reader)
            except StopIteration:
                pass
        self.line_num = self.reader.line_num
        return self._fieldnames

    @fieldnames.setter
    def fieldnames(self, value):
        self._fieldnames = value

    def __next__(self):
        if self.line_num == 0:
            # Used only for its side effect.
            self.fieldnames
        row = next(self.reader)
        self.line_num = self.reader.line_num

        # unlike the basic reader, we prefer not to return blanks,
        # because we will typically wind up with a dict full of None
        # values
        while row == []:
            row = next(self.reader)
        d = dict(zip(self.fieldnames, row))
        lf = len(self.fieldnames)
        lr = len(row)
        if lf < lr:
            d[self.restkey] = row[lf:]
        elif lf > lr:
            for key in self.fieldnames[lr:]:
                d[key] = self.restval
        return d

    __class_getitem__ = classmethod(types.GenericAlias)
File:           ~/miniconda3/envs/bb1000/lib/python3.13/csv.py
Type:           type
Subclasses:     

External libraries#

import numpy

#dir(numpy)

import numpy.linalg

dir(numpy.linalg)

['LinAlgError',
 '__all__',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__path__',
 '__spec__',
 '_linalg',
 '_umath_linalg',
 'cholesky',
 'cond',
 'cross',
 'det',
 'diagonal',
 'eig',
 'eigh',
 'eigvals',
 'eigvalsh',
 'inv',
 'linalg',
 'lstsq',
 'matmul',
 'matrix_norm',
 'matrix_power',
 'matrix_rank',
 'matrix_transpose',
 'multi_dot',
 'norm',
 'outer',
 'pinv',
 'qr',
 'slogdet',
 'solve',
 'svd',
 'svdvals',
 'tensordot',
 'tensorinv',
 'tensorsolve',
 'test',
 'trace',
 'vecdot',
 'vector_norm']

numpy.linalg.solve?

Signature:       numpy.linalg.solve(a, b)
Call signature:  numpy.linalg.solve(*args, **kwargs)
Type:            _ArrayFunctionDispatcher
String form:     <function solve at 0x71a69c5f8ea0>
File:            ~/miniconda3/envs/bb1000/lib/python3.13/site-packages/numpy/linalg/_linalg.py
Docstring:      
Solve a linear matrix equation, or system of linear scalar equations.

Computes the "exact" solution, `x`, of the well-determined, i.e., full
rank, linear matrix equation `ax = b`.

Parameters
----------
a : (..., M, M) array_like
    Coefficient matrix.
b : {(M,), (..., M, K)}, array_like
    Ordinate or "dependent variable" values.

Returns
-------
x : {(..., M,), (..., M, K)} ndarray
    Solution to the system a x = b.  Returned shape is (..., M) if b is
    shape (M,) and (..., M, K) if b is (..., M, K), where the "..." part is
    broadcasted between a and b.

Raises
------
LinAlgError
    If `a` is singular or not square.

See Also
--------
scipy.linalg.solve : Similar function in SciPy.

Notes
-----
Broadcasting rules apply, see the `numpy.linalg` documentation for
details.

The solutions are computed using LAPACK routine ``_gesv``.

`a` must be square and of full-rank, i.e., all rows (or, equivalently,
columns) must be linearly independent; if either is not true, use
`lstsq` for the least-squares best "solution" of the
system/equation.

.. versionchanged:: 2.0

   The b array is only treated as a shape (M,) column vector if it is
   exactly 1-dimensional. In all other instances it is treated as a stack
   of (M, K) matrices. Previously b would be treated as a stack of (M,)
   vectors if b.ndim was equal to a.ndim - 1.

References
----------
.. [1] G. Strang, *Linear Algebra and Its Applications*, 2nd Ed., Orlando,
       FL, Academic Press, Inc., 1980, pg. 22.

Examples
--------
Solve the system of equations:
``x0 + 2 * x1 = 1`` and
``3 * x0 + 5 * x1 = 2``:

>>> import numpy as np
>>> a = np.array([[1, 2], [3, 5]])
>>> b = np.array([1, 2])
>>> x = np.linalg.solve(a, b)
>>> x
array([-1.,  1.])

Check that the solution is correct:

>>> np.allclose(np.dot(a, x), b)
True
Class docstring:
Class to wrap functions with checks for __array_function__ overrides.

All arguments are required, and can only be passed by position.

Parameters
----------
dispatcher : function or None
    The dispatcher function that returns a single sequence-like object
    of all arguments relevant.  It must have the same signature (except
    the default values) as the actual implementation.
    If ``None``, this is a ``like=`` dispatcher and the
    ``_ArrayFunctionDispatcher`` must be called with ``like`` as the
    first (additional and positional) argument.
implementation : function
    Function that implements the operation on NumPy arrays without
    overrides.  Arguments passed calling the ``_ArrayFunctionDispatcher``
    will be forwarded to this (and the ``dispatcher``) as if using
    ``*args, **kwargs``.

Attributes
----------
_implementation : function
    The original implementation passed in.

a = [[3, 0, 0], [1, 8, 0], [0, 4, -2]]
b = [30, 18, 2]

x = numpy.linalg.solve(a, b)
x

array([10.,  1.,  1.])

x[0] + 3*x[1] + x[2]

np.float64(14.0)

c = numpy.array([1, 3, 1])
c

array([1, 3, 1])

numpy.dot(x, c)

np.float64(14.0)

numpy.linspace(0, 1, 11)

array([0. , 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. ])

list(range(0, 11, 1))

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

numpy.arange(0, 11, .1)

array([ 0. ,  0.1,  0.2,  0.3,  0.4,  0.5,  0.6,  0.7,  0.8,  0.9,  1. ,
        1.1,  1.2,  1.3,  1.4,  1.5,  1.6,  1.7,  1.8,  1.9,  2. ,  2.1,
        2.2,  2.3,  2.4,  2.5,  2.6,  2.7,  2.8,  2.9,  3. ,  3.1,  3.2,
        3.3,  3.4,  3.5,  3.6,  3.7,  3.8,  3.9,  4. ,  4.1,  4.2,  4.3,
        4.4,  4.5,  4.6,  4.7,  4.8,  4.9,  5. ,  5.1,  5.2,  5.3,  5.4,
        5.5,  5.6,  5.7,  5.8,  5.9,  6. ,  6.1,  6.2,  6.3,  6.4,  6.5,
        6.6,  6.7,  6.8,  6.9,  7. ,  7.1,  7.2,  7.3,  7.4,  7.5,  7.6,
        7.7,  7.8,  7.9,  8. ,  8.1,  8.2,  8.3,  8.4,  8.5,  8.6,  8.7,
        8.8,  8.9,  9. ,  9.1,  9.2,  9.3,  9.4,  9.5,  9.6,  9.7,  9.8,
        9.9, 10. , 10.1, 10.2, 10.3, 10.4, 10.5, 10.6, 10.7, 10.8, 10.9])

numpy.savetxt('numbers.txt', numpy.arange(0, 11, .1).reshape((11, 10)))

numbers = numpy.loadtxt('numbers.txt')
numbers

array([[ 0. ,  0.1,  0.2,  0.3,  0.4,  0.5,  0.6,  0.7,  0.8,  0.9],
       [ 1. ,  1.1,  1.2,  1.3,  1.4,  1.5,  1.6,  1.7,  1.8,  1.9],
       [ 2. ,  2.1,  2.2,  2.3,  2.4,  2.5,  2.6,  2.7,  2.8,  2.9],
       [ 3. ,  3.1,  3.2,  3.3,  3.4,  3.5,  3.6,  3.7,  3.8,  3.9],
       [ 4. ,  4.1,  4.2,  4.3,  4.4,  4.5,  4.6,  4.7,  4.8,  4.9],
       [ 5. ,  5.1,  5.2,  5.3,  5.4,  5.5,  5.6,  5.7,  5.8,  5.9],
       [ 6. ,  6.1,  6.2,  6.3,  6.4,  6.5,  6.6,  6.7,  6.8,  6.9],
       [ 7. ,  7.1,  7.2,  7.3,  7.4,  7.5,  7.6,  7.7,  7.8,  7.9],
       [ 8. ,  8.1,  8.2,  8.3,  8.4,  8.5,  8.6,  8.7,  8.8,  8.9],
       [ 9. ,  9.1,  9.2,  9.3,  9.4,  9.5,  9.6,  9.7,  9.8,  9.9],
       [10. , 10.1, 10.2, 10.3, 10.4, 10.5, 10.6, 10.7, 10.8, 10.9]])

# corresponds to
value_list = []
# The with-block closes the file as soon as all lines have been read
with open('numbers.txt') as numbers_file:
    for line in numbers_file:
        # start a new inner list for every line of the file
        value_list.append([])
        for element in line.split():
            # split() cuts the line at whitespace; each piece is parsed as a float
            value = float(element)
            value_list[-1].append(value)
# turn the list of lists into an array
value_array = numpy.array(value_list)
value_array

array([[ 0. ,  0.1,  0.2,  0.3,  0.4,  0.5,  0.6,  0.7,  0.8,  0.9],
       [ 1. ,  1.1,  1.2,  1.3,  1.4,  1.5,  1.6,  1.7,  1.8,  1.9],
       [ 2. ,  2.1,  2.2,  2.3,  2.4,  2.5,  2.6,  2.7,  2.8,  2.9],
       [ 3. ,  3.1,  3.2,  3.3,  3.4,  3.5,  3.6,  3.7,  3.8,  3.9],
       [ 4. ,  4.1,  4.2,  4.3,  4.4,  4.5,  4.6,  4.7,  4.8,  4.9],
       [ 5. ,  5.1,  5.2,  5.3,  5.4,  5.5,  5.6,  5.7,  5.8,  5.9],
       [ 6. ,  6.1,  6.2,  6.3,  6.4,  6.5,  6.6,  6.7,  6.8,  6.9],
       [ 7. ,  7.1,  7.2,  7.3,  7.4,  7.5,  7.6,  7.7,  7.8,  7.9],
       [ 8. ,  8.1,  8.2,  8.3,  8.4,  8.5,  8.6,  8.7,  8.8,  8.9],
       [ 9. ,  9.1,  9.2,  9.3,  9.4,  9.5,  9.6,  9.7,  9.8,  9.9],
       [10. , 10.1, 10.2, 10.3, 10.4, 10.5, 10.6, 10.7, 10.8, 10.9]])

numbers[:, : ]# all
numbers[:, -1 ] #last column
numbers[:1, :] # first row
numbers[:3, :3] # upper left 3x3 subblock

array([[0. , 0.1, 0.2],
       [1. , 1.1, 1.2],
       [2. , 2.1, 2.2]])

import time
import numpy
n = 512
a = numpy.ones((n, n))
b = numpy.ones((n, n))
c = numpy.zeros((n, n))
t1 = time.time()
for i in range(n):
    for j in range(n):
        for k in range(n):
            c[i, j] += a[i, k]*b[k, j]
t2 = time.time()
print("Loop timing", t2-t1)

Loop timing 118.91819548606873

t1 = time.time()
c = a @ b
t2 = time.time()
print("Loop timing", t2-t1)

Loop timing 0.03766798973083496

arr = numbers[:3, :3]
arr

array([[0. , 0.1, 0.2],
       [1. , 1.1, 1.2],
       [2. , 2.1, 2.2]])

numpy.linalg.inv(arr)

---------------------------------------------------------------------------
LinAlgError                               Traceback (most recent call last)
Cell In[207], line 1
----> 1 numpy.linalg.inv(arr)

File ~/miniconda3/envs/bb1000/lib/python3.13/site-packages/numpy/linalg/_linalg.py:609, in inv(a)
    606 signature = 'D->D' if isComplexType(t) else 'd->d'
    607 with errstate(call=_raise_linalgerror_singular, invalid='call',
    608               over='ignore', divide='ignore', under='ignore'):
--> 609     ainv = _umath_linalg.inv(a, signature=signature)
    610 return wrap(ainv.astype(result_t, copy=False))

File ~/miniconda3/envs/bb1000/lib/python3.13/site-packages/numpy/linalg/_linalg.py:104, in _raise_linalgerror_singular(err, flag)
    103 def _raise_linalgerror_singular(err, flag):
--> 104     raise LinAlgError("Singular matrix")

LinAlgError: Singular matrix

numpy.linalg.det(arr)

np.float64(0.0)

a = numpy.array([[3, 0, 0], [1, 8, 0], [0, 4, -2]])
b = numpy.array([30, 18, 2])

x = numpy.linalg.solve(a, b)
x

array([10.,  1.,  1.])

a @ x

array([30., 18.,  2.])

numpy.linalg.inv(a) @ a @ x

array([10.,  1.,  1.])

# a @ x = b
# a(inv) @ a @ x = a(inv) @ b
# x = a(inv) @ b
numpy.linalg.inv(a) @ b

array([10.,  1.,  1.])

Matplotlib#

import matplotlib.pyplot as plt

x = numpy.arange(0, 2*3.1415, .1)

array([0. , 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. , 1.1, 1.2,
       1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2. , 2.1, 2.2, 2.3, 2.4, 2.5,
       2.6, 2.7, 2.8, 2.9, 3. , 3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8,
       3.9, 4. , 4.1, 4.2, 4.3, 4.4, 4.5, 4.6, 4.7, 4.8, 4.9, 5. , 5.1,
       5.2, 5.3, 5.4, 5.5, 5.6, 5.7, 5.8, 5.9, 6. , 6.1, 6.2])

numpy.sin(x)

array([ 0.        ,  0.09983342,  0.19866933,  0.29552021,  0.38941834,
        0.47942554,  0.56464247,  0.64421769,  0.71735609,  0.78332691,
        0.84147098,  0.89120736,  0.93203909,  0.96355819,  0.98544973,
        0.99749499,  0.9995736 ,  0.99166481,  0.97384763,  0.94630009,
        0.90929743,  0.86320937,  0.8084964 ,  0.74570521,  0.67546318,
        0.59847214,  0.51550137,  0.42737988,  0.33498815,  0.23924933,
        0.14112001,  0.04158066, -0.05837414, -0.15774569, -0.2555411 ,
       -0.35078323, -0.44252044, -0.52983614, -0.61185789, -0.68776616,
       -0.7568025 , -0.81827711, -0.87157577, -0.91616594, -0.95160207,
       -0.97753012, -0.993691  , -0.99992326, -0.99616461, -0.98245261,
       -0.95892427, -0.92581468, -0.88345466, -0.83226744, -0.77276449,
       -0.70554033, -0.63126664, -0.55068554, -0.46460218, -0.37387666,
       -0.2794155 , -0.1821625 , -0.0830894 ])

plt.plot(x, numpy.sin(x), label='sin')
plt.plot(x, numpy.cos(x), label='cos')
plt.legend()
plt.show()

_images/ff542241aca2b529cb4740f7506ef027c8a93ce007500ca152bfd6cd0591a379.png

Pandas#

import pandas as pd

s = pd.Series(range(4))/10

  0.0
  0.1
  0.2
  0.3
dtype: float64

s[1: 2]

1    0.1
dtype: float64

t = pd.Series(range(4), index=['a', 'b', 'c', 'd'])/10

a    0.0
b    0.1
c    0.2
d    0.3
dtype: float64

t['b': 'c']

b    0.1
c    0.2
dtype: float64

# indexing with lists
s[[0, 3]]

0    0.0
3    0.3
dtype: float64

t[['a', 'd']]

a    0.0
d    0.3
dtype: float64

#filtering
s > .1

  False
  False
   True
   True
dtype: bool

s[[False, True, True, False]]

1    0.1
2    0.2
dtype: float64

s[s > .1]

2    0.2
3    0.3
dtype: float64

s.mean()

np.float64(0.15000000000000002)

#dir(s)

Dataframes#

data = {'country': ['Belgium', 'France', 'Germany', 'Netherlands', 'United Kingdom'],
       'population': [11.3, 64.3, 81.3, 16.9, 64.9],
       'area': [30510, 671308, 357050, 41526, 244820],
       'capital': ['Brussels', 'Paris', 'Berlin', 'Amsterdam', 'London']}

df = pd.DataFrame(data)
df

	country	population	area	capital
0	Belgium	11.3	30510	Brussels
1	France	64.3	671308	Paris
2	Germany	81.3	357050	Berlin
3	Netherlands	16.9	41526	Amsterdam
4	United Kingdom	64.9	244820	London

df.set_index('country')

	population	area	capital
country
Belgium	11.3	30510	Brussels
France	64.3	671308	Paris
Germany	81.3	357050	Berlin
Netherlands	16.9	41526	Amsterdam
United Kingdom	64.9	244820	London

df #unmodified

	country	population	area	capital
0	Belgium	11.3	30510	Brussels
1	France	64.3	671308	Paris
2	Germany	81.3	357050	Berlin
3	Netherlands	16.9	41526	Amsterdam
4	United Kingdom	64.9	244820	London

# equivalent to df = df.set_index('country')
df.set_index('country', inplace=True)

df

	population	area	capital
country
Belgium	11.3	30510	Brussels
France	64.3	671308	Paris
Germany	81.3	357050	Berlin
Netherlands	16.9	41526	Amsterdam
United Kingdom	64.9	244820	London

#select elements
df['capital']

country
Belgium            Brussels
France                Paris
Germany              Berlin
Netherlands       Amsterdam
United Kingdom       London
Name: capital, dtype: object

df['capital']['France']

'Paris'

#alt
df.loc['France', 'capital']

'Paris'

# modify element
df['population']['Belgium'] = 11.4 

/tmp/ipykernel_1077854/1863415940.py:2: FutureWarning: ChainedAssignmentError: behaviour will change in pandas 3.0!
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df['population']['Belgium'] = 11.4
/tmp/ipykernel_1077854/1863415940.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['population']['Belgium'] = 11.4

df.loc['Belgium', 'population'] = 11.4

df

	population	area	capital
country
Belgium	11.4	30510	Brussels
France	64.3	671308	Paris
Germany	81.3	357050	Berlin
Netherlands	16.9	41526	Amsterdam
United Kingdom	64.9	244820	London

df.iloc[0, 0] = 11.5

df

	population	area	capital
country
Belgium	11.5	30510	Brussels
France	64.3	671308	Paris
Germany	81.3	357050	Berlin
Netherlands	16.9	41526	Amsterdam
United Kingdom	64.9	244820	London

df['density'] = df.population / df.area * 1000

df

	population	area	capital	density
country
Belgium	11.5	30510	Brussels	0.376926
France	64.3	671308	Paris	0.095783
Germany	81.3	357050	Berlin	0.227699
Netherlands	16.9	41526	Amsterdam	0.406974
United Kingdom	64.9	244820	London	0.265093

df.density > .3

country
Belgium            True
France            False
Germany           False
Netherlands        True
United Kingdom    False
Name: density, dtype: bool

df[df.density > .3]

	population	area	capital	density
country
Belgium	11.5	30510	Brussels	0.376926
Netherlands	16.9	41526	Amsterdam	0.406974

df.area /= 1000

df

	population	area	capital	density
country
Belgium	11.5	30.510	Brussels	0.376926
France	64.3	671.308	Paris	0.095783
Germany	81.3	357.050	Berlin	0.227699
Netherlands	16.9	41.526	Amsterdam	0.406974
United Kingdom	64.9	244.820	London	0.265093

df.plot()

<Axes: xlabel='country'>

_images/c5f05156364d8585695d40aa85d3bf8428282ad0fd1f6173e3577a9928e17b4b.png

df.plot(kind='bar')

<Axes: xlabel='country'>

_images/e8b31bebe9e7a4d8e184cfbfe054b9d2aef9cce75bdae57e3b7ba7d8dd41766e.png

# reading csv files
datafile

PosixPath('/home/bb1000-vt25/Downloads/MOCK_DATA.csv')

personel = pd.read_csv(datafile)

personel.salary.max()

np.int64(149702)

personel.salary.min()

np.int64(20157)

personel.salary.mean()

np.float64(85548.704)

#dir(personel)

personel.groupby('gender')['salary'].mean()

gender
Agender        96231.916667
Bigender       87399.050000
Female         86417.080357
Genderfluid    95576.764706
Genderqueer    86913.000000
Male           84138.439189
Non-binary     86857.529412
Polygender     77018.111111
Name: salary, dtype: float64

personel.groupby('gender')['salary'].mean().round()

gender
Agender        96232.0
Bigender       87399.0
Female         86417.0
Genderfluid    95577.0
Genderqueer    86913.0
Male           84138.0
Non-binary     86858.0
Polygender     77018.0
Name: salary, dtype: float64

personel.groupby('gender')['salary'].mean().round().astype(int)

gender
Agender        96232
Bigender       87399
Female         86417
Genderfluid    95577
Genderqueer    86913
Male           84138
Non-binary     86858
Polygender     77018
Name: salary, dtype: int64

# get 10 lowest salaries for females, lowest first
personel[personel.gender == 'Female'].sort_values('salary').head(10)

	id	first_name	last_name	email	gender	ip_address	employee_id	salary	department
122	123	Nettie	Jandak	njandak3e@icq.com	Female	166.251.166.227	123	20317	Marketing
12	13	Lonee	Staunton	lstauntonc@surveymonkey.com	Female	138.24.99.81	13	20815	Marketing
155	156	Cymbre	Balam	cbalam4b@spotify.com	Female	55.156.47.9	156	20942	Engineering
339	340	Anna-maria	Dadge	adadge9f@skyrock.com	Female	91.173.33.3	340	21654	Sales
106	107	Amye	Schofield	aschofield2y@globo.com	Female	35.209.149.103	107	21930	Services
583	584	Terri	Eat	teatg7@wsj.com	Female	49.14.126.170	584	23226	Research and Development
286	287	Lillis	Jude	ljude7y@reddit.com	Female	119.211.41.215	287	23616	Engineering
145	146	Karna	Eyton	keyton41@tripadvisor.com	Female	41.142.223.103	146	23657	Accounting
19	20	Noell	Gadney	ngadneyj@blinklist.com	Female	218.219.155.4	20	23794	Research and Development
191	192	Golda	Cogswell	gcogswell5b@jigsy.com	Female	141.109.221.182	192	24087	Support

personel.plot(kind='box', column='salary', by='gender', vert=False)

salary    Axes(0.125,0.11;0.775x0.77)
dtype: object

_images/c77f06e3bfbf02b1beb6843b3ad645777c282300673a8ccfb524649808276c13.png

personel.groupby('gender')['salary'].describe()

	count	mean	std	min	25%	50%	75%	max
gender
Agender	12.0	96231.916667	33622.378677	38414.0	72965.75	101333.5	119789.75	149702.0
Bigender	20.0	87399.050000	37020.164347	28684.0	52147.25	95875.5	115418.00	138593.0
Female	448.0	86417.080357	38494.376223	20317.0	52904.00	86963.5	121934.00	148645.0
Genderfluid	17.0	95576.764706	32896.581404	36697.0	74373.00	102223.0	120759.00	149576.0
Genderqueer	24.0	86913.000000	42049.463061	22271.0	47929.75	87971.5	121489.25	149019.0
Male	444.0	84138.439189	36493.311676	20157.0	53360.75	82365.0	115660.00	149676.0
Non-binary	17.0	86857.529412	42239.068737	29927.0	48726.00	91899.0	128314.00	148927.0
Polygender	18.0	77018.111111	44610.251539	22343.0	34399.75	69915.5	119753.25	145094.0

Notes on the lab#

x = numpy.arange(10)

y = numpy.arange(1, 20, 2)

len(x), len(y)

(10, 10)

# the coefficient matrix
sum_of_squares_x = x @ x

sum_of_x = sum(x)

sum_ones = len(x)

# 2x2 matrix assembled from the sums computed in the cells above;
# sum_ones (= len(x)) is used by name so every entry refers to a named sum
coefficient_matrix = numpy.array(
[
   [ sum_of_squares_x, sum_of_x],
   [ sum_of_x, sum_ones]
])
coefficient_matrix

array([[285,  45],
       [ 45,  10]])

# the right-hand side
rhs = numpy.array([ x @ y, sum(y)])
rhs

array([615, 100])

numpy.linalg.solve(coefficient_matrix, rhs)

array([2., 1.])

### oo plotting

fig, ax = plt.subplots()

_images/1d9291a9ac5bde7c4ddefc7b11da247268e3056dcf3eb1f857e3a5dad85748ba.png

fig, (ax, ax2) = plt.subplots(ncols=2)
#dir(ax)
ax.set_title('Demo title')
ax.set_xlabel('x-data')
ax.set_ylabel('y-data')
ax2.scatter(x, y, marker='o')
ax.plot(x, y, color='red')

[<matplotlib.lines.Line2D at 0x71a6755d4b90>]

_images/d77527930ac61e0e73bf3d1c494e4407033d78700ca159a648021c3c0de33fee.png

import seaborn
seaborn.set_theme(style="ticks", palette="pastel")

seaborn.boxplot(data=personel, y='department', x='salary')

<Axes: xlabel='salary', ylabel='department'>

_images/20dc60c73fdfc4004ba510236b1195222d80f756cda3896cf90b6245cb6a8370.png

seaborn.violinplot(data=personel, y='department', x='salary')

<Axes: xlabel='salary', ylabel='department'>

_images/0a27f26a2aef5f636c9d511e496bae9f07884db8487ee83d931f59cc3eeeb693.png

Interactive plots#

import numpy as np

x = np.arange(-np.pi, np.pi, .1)

import matplotlib.pyplot as plt

plt.plot(x, np.sin(x))
plt.plot(x, np.sin(2*x))

[<matplotlib.lines.Line2D at 0x71a5f1dd9f90>]

_images/d6b751cfeb99dff149a2e55df5f96e9d6e763d50d98d4f4432c794b7c7349f37.png

def sinplot(freq=1.0):
    """Plot sin(freq*x) with plt.plot on a grid from -pi up to pi in steps of 0.1.

    freq: factor multiplying x inside the sine.
    """
    grid = np.arange(-np.pi, np.pi, .1)
    wave = np.sin(freq*grid)
    plt.plot(grid, wave)

sinplot(1)
sinplot(2)

#!conda install ipywidgets -y
#!conda install jupyterlab_widgets -y

import ipywidgets

freq_slider = ipywidgets.FloatSlider(min=1, max=2, value=1.5)

ipywidgets.interact(sinplot, freq=freq_slider)

<function __main__.sinplot(freq=1.0)>

plt.plot(x, np.exp(-x**2))
a = 2
plt.plot(x, np.exp(-a*x**2))

[<matplotlib.lines.Line2D at 0x71a5f0b6dbd0>]

_images/e0d84405302c408316b1e64e7c76c83b4344618fff62f16ae415095d7bd7997d.png

def gaussplot(a=1):
    """Plot exp(-a*x**2) with plt.plot on a grid from -pi up to pi in steps of 0.1.

    a: factor multiplying x**2 in the exponent.

    The grid is created inside the function, like in sinplot, so the plot
    does not depend on what the module-level name x happens to be bound to
    at the time of the call (for instance when called from a slider).
    """
    x = np.arange(-np.pi, np.pi, .1)
    plt.plot(x, np.exp(-a*x**2))

gaussplot(a=1)
gaussplot(a=2)

from ipywidgets import interact, FloatSlider
interact(gaussplot, a=FloatSlider(min=1.0, max=2.0))

<function __main__.gaussplot(a=1)>

Reproducing gapminder presentation by Hans Rosling#

TED talk: https://www.ted.com/talks/hans_rosling_new_insights_on_poverty

screenshot

Can we reproduce the figure to some degree? Data is online at https://www.gapminder.org/data/

Scatterplot
x-value GDP per capita
y-values Infant mortality
each circle is a country
size reflects population
color reflects geographic region

import pandas as pd
pop = pd.read_csv('pop.csv').set_index('country')
pop

	1800	1801	1802	1803	1804	1805	1806	1807	1808	1809	...	2091	2092	2093	2094	2095	2096	2097	2098	2099	2100
country
Afghanistan	3.28M	3.28M	3.28M	3.28M	3.28M	3.28M	3.28M	3.28M	3.28M	3.28M	...	124M	125M	126M	126M	127M	128M	128M	129M	130M	130M
Angola	1.57M	1.57M	1.57M	1.57M	1.57M	1.57M	1.57M	1.57M	1.57M	1.57M	...	139M	140M	142M	143M	144M	145M	147M	148M	149M	150M
Albania	400k	402k	404k	405k	407k	409k	411k	413k	414k	416k	...	1.34M	1.32M	1.3M	1.29M	1.27M	1.25M	1.23M	1.22M	1.2M	1.18M
Andorra	2650	2650	2650	2650	2650	2650	2650	2650	2650	2650	...	52.8k	52.1k	51.5k	50.8k	50.2k	49.6k	49k	48.4k	47.8k	47.2k
UAE	40.2k	40.2k	40.2k	40.2k	40.2k	40.2k	40.2k	40.2k	40.2k	40.2k	...	24.1M	24.3M	24.5M	24.7M	25M	25.2M	25.4M	25.7M	25.9M	26.1M
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
Samoa	47.3k	47.3k	47.3k	47.3k	47.3k	47.3k	47.3k	47.2k	47.2k	47.2k	...	370k	372k	374k	375k	377k	378k	380k	381k	382k	384k
Yemen	2.59M	2.59M	2.59M	2.59M	2.59M	2.59M	2.59M	2.59M	2.59M	2.59M	...	107M	107M	107M	108M	108M	109M	109M	109M	110M	110M
South Africa	1.45M	1.45M	1.46M	1.46M	1.47M	1.47M	1.48M	1.49M	1.49M	1.5M	...	92.4M	92.6M	92.9M	93.1M	93.3M	93.5M	93.7M	93.9M	94.1M	94.3M
Zambia	747k	758k	770k	782k	794k	806k	818k	831k	843k	856k	...	61.1M	61.5M	61.9M	62.3M	62.7M	63.1M	63.4M	63.8M	64.1M	64.5M
Zimbabwe	1.09M	1.09M	1.09M	1.09M	1.09M	1.09M	1.09M	1.09M	1.09M	1.09M	...	36.3M	36.4M	36.5M	36.6M	36.7M	36.8M	36.9M	37M	37.1M	37.2M

197 rows × 301 columns

gdp = pd.read_csv('gdp_pcap.csv').set_index('country')
gdp

	1800	1801	1802	1803	1804	1805	1806	1807	1808	1809	...	2091	2092	2093	2094	2095	2096	2097	2098	2099	2100
country
Afghanistan	481	481	481	481	481	481	481	481	481	481	...	4680	4790	4910	5020	5140	5260	5380	5510	5640	5780
Angola	373	374	376	378	379	381	383	385	386	388	...	24.5k	25k	25.6k	26.1k	26.6k	27.1k	27.7k	28.2k	28.8k	29.3k
Albania	469	471	472	473	475	476	477	479	480	482	...	54.5k	55.1k	55.7k	56.3k	56.9k	57.4k	58k	58.6k	59.2k	59.8k
Andorra	1370	1370	1370	1380	1380	1380	1390	1390	1390	1390	...	79.9k	80.2k	80.4k	80.7k	81k	81.3k	81.5k	81.8k	82k	82.3k
UAE	1140	1150	1150	1150	1160	1160	1170	1170	1180	1180	...	92.6k	92.6k	92.6k	92.7k	92.7k	92.7k	92.8k	92.8k	92.8k	92.9k
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
Samoa	1600	1600	1600	1600	1600	1600	1600	1600	1600	1600	...	24k	24.5k	25k	25.5k	26k	26.5k	27k	27.6k	28.1k	28.6k
Yemen	1010	1010	1020	1020	1020	1020	1030	1030	1030	1030	...	6170	6320	6470	6620	6780	6950	7120	7290	7470	7650
South Africa	1750	1730	1710	1690	1670	1590	1590	1720	1510	1470	...	44.5k	45.1k	45.7k	46.4k	47k	47.6k	48.2k	48.8k	49.5k	50.1k
Zambia	533	535	536	537	539	539	541	543	543	545	...	16.6k	17k	17.4k	17.8k	18.2k	18.6k	19k	19.4k	19.9k	20.3k
Zimbabwe	919	920	921	922	923	924	925	926	927	928	...	9840	10.1k	10.3k	10.6k	10.8k	11.1k	11.4k	11.6k	11.9k	12.2k

195 rows × 301 columns

childm = pd.read_csv('child_mortality_0_5_year_olds_dying_per_1000_born.csv').set_index('country')
childm

	1800	1801	1802	1803	1804	1805	1806	1807	1808	1809	...	2091	2092	2093	2094	2095	2096	2097	2098	2099	2100
country
Afghanistan	469.0	469.0	469.0	469.0	469.0	469.0	470.0	470.0	470.0	470.0	...	12.60	12.40	12.20	12.00	11.80	11.60	11.50	11.30	11.10	11.10
Angola	486.0	486.0	486.0	486.0	486.0	486.0	486.0	486.0	486.0	486.0	...	17.70	17.50	17.30	17.10	17.00	16.80	16.60	16.40	16.30	16.30
Albania	375.0	375.0	375.0	375.0	375.0	375.0	375.0	375.0	375.0	375.0	...	2.32	2.30	2.27	2.24	2.22	2.19	2.16	2.14	2.11	2.11
Andorra	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	0.86	0.84	0.83	0.81	0.80	0.79	0.78	0.77	0.76	0.76
UAE	434.0	434.0	434.0	434.0	434.0	434.0	434.0	434.0	434.0	434.0	...	2.31	2.29	2.26	2.24	2.22	2.19	2.17	2.15	2.13	2.13
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
Samoa	471.0	468.0	465.0	461.0	458.0	455.0	452.0	449.0	446.0	443.0	...	3.73	3.70	3.67	3.65	3.62	3.59	3.56	3.54	3.51	3.51
Yemen	540.0	540.0	540.0	540.0	540.0	540.0	540.0	540.0	540.0	540.0	...	14.30	14.10	13.80	13.60	13.40	13.20	13.00	12.80	12.60	12.60
South Africa	398.0	398.0	398.0	398.0	398.0	398.0	398.0	398.0	398.0	398.0	...	10.50	10.40	10.20	10.10	9.95	9.82	9.68	9.55	9.42	9.42
Zambia	410.0	410.0	410.0	410.0	410.0	410.0	410.0	410.0	410.0	410.0	...	12.50	12.30	12.20	12.10	11.90	11.80	11.70	11.60	11.40	11.40
Zimbabwe	396.0	396.0	396.0	396.0	396.0	396.0	396.0	396.0	396.0	396.0	...	14.60	14.50	14.40	14.30	14.20	14.10	13.90	13.80	13.70	13.70

197 rows × 301 columns

x = gdp.loc[['Sweden', 'Norway', 'Denmark'], '1921']

y = childm.loc[['Sweden', 'Norway', 'Denmark'], '1921']

country
Sweden     5200
Norway     4830
Denmark    6980
Name: 1921, dtype: object

country
Sweden     84.6
Norway     71.8
Denmark    91.3
Name: 1921, dtype: float64

plt.scatter(x, -np.log(y))

<matplotlib.collections.PathCollection at 0x71a5e9a5dbd0>

_images/45dd69e084a783cc7afd22df23ff1ca44a8dc0295512b7af8600b7a65908533b.png

len(gdp), len(childm)

(195, 197)

df = pd.DataFrame()
df['GDP'] = gdp.loc[:, '1921']
df['Child mortality'] = childm.loc[:, '1921']
df['Population'] = pop.loc[: ,'1921']
df

	GDP	Child mortality	Population
country
Afghanistan	908	465.0	10.6M
Angola	776	465.0	2.91M
Albania	820	359.0	943k
Andorra	4870	NaN	5610
UAE	2520	416.0	57.3k
...	...	...	...
Samoa	2430	204.0	40.7k
Yemen	1420	517.0	3.49M
South Africa	2220	381.0	7.3M
Zambia	799	392.0	1.48M
Zimbabwe	731	379.0	1.56M

195 rows × 3 columns

df.plot.scatter(x='GDP', y='Child mortality')

<Axes: xlabel='GDP', ylabel='Child mortality'>

_images/f12fa89dfa0fe89ade053c744dacb91c5821b314b695ad1a2459c24bda4d3616.png

Update#

make numeric columns#

How do we translate

k -> \(10^3\)
M -> \(10^6\)
B -> \(10^9\)

in a systematic way?

Consider e.g. 10.6M, meaning \(10.6*10^6\)

A function that takes the string, splits off the final character and multiplies the remaining number by the corresponding factor

def convert(x: str) -> float:
    """Convert a number written with a magnitude suffix to a float.

    The final character selects the factor: 'k' -> 10**3, 'M' -> 10**6,
    'B' -> 10**9.  A string without such a suffix is passed to float()
    as it is.

    Values that are not strings (e.g. numbers that are already numeric, or
    missing values) and the empty string are returned unchanged.

    Raises ValueError if the numeric part of the string cannot be parsed
    by float().
    """
    factors = {'k': 10**3, 'M': 10**6, 'B': 10**9}
    # do nothing for non-strings; an empty string has no final character
    if not isinstance(x, str) or not x:
        return x
    suffix = x[-1]
    if suffix in factors:
        # everything before the suffix is the number to be scaled
        return float(x[:-1]) * factors[suffix]
    return float(x)
        

convert(2.0), convert('3k'), convert('4M'), convert('1B')

(2.0, 3000.0, 4000000.0, 1000000000.0)

The built-in function map applies a function to each value in a sequence

for x in map(convert, [2.0, '3k', '4M', '1B']):
    print(x)

2.0
3000.0
4000000.0
1000000000.0

In a similar vein, pandas objects have a map method that applies a function to each of their elements

df['Population'].map(convert)

country
Afghanistan     10600000.0
Angola           2910000.0
Albania           943000.0
Andorra             5610.0
UAE                57300.0
                   ...    
Samoa              40700.0
Yemen            3490000.0
South Africa     7300000.0
Zambia           1480000.0
Zimbabwe         1560000.0
Name: Population, Length: 195, dtype: float64

Apply to and replace all

df = df.map(convert)

To let the population be represented by the size of a circle in the plot, we scale it further with the square root (note that the s argument of a scatter plot is the marker area in points squared)

df['Population'] = np.sqrt(df['Population'])

df

	GDP	Child mortality	Population
country
Afghanistan	908.0	465.0	3255.764119
Angola	776.0	465.0	1705.872211
Albania	820.0	359.0	971.081871
Andorra	4870.0	NaN	74.899933
UAE	2520.0	416.0	239.374184
...	...	...	...
Samoa	2430.0	204.0	201.742410
Yemen	1420.0	517.0	1868.154169
South Africa	2220.0	381.0	2701.851217
Zambia	799.0	392.0	1216.552506
Zimbabwe	731.0	379.0	1248.999600

195 rows × 3 columns

# plt.subplots() returns the figure and its axes; the scatter plot is drawn on that axes
fig, ax = plt.subplots()
df.plot.scatter(x='GDP', y='Child mortality', s='Population', ax=ax)

<Axes: xlabel='GDP', ylabel='Child mortality'>

_images/29414511ded29bc4df4cd56932cfebaf6f05b100bbac7b281f41fd2a4b330bbe.png

# scale some more
df['Population'] /= 10

df

	GDP	Child mortality	Population
country
Afghanistan	908.0	465.0	325.576412
Angola	776.0	465.0	170.587221
Albania	820.0	359.0	97.108187
Andorra	4870.0	NaN	7.489993
UAE	2520.0	416.0	23.937418
...	...	...	...
Samoa	2430.0	204.0	20.174241
Yemen	1420.0	517.0	186.815417
South Africa	2220.0	381.0	270.185122
Zambia	799.0	392.0	121.655251
Zimbabwe	731.0	379.0	124.899960

195 rows × 3 columns

df.plot.scatter(x='GDP', y='Child mortality', s='Population')

<Axes: xlabel='GDP', ylabel='Child mortality'>

_images/72173f2ebd4b070059048aa28430d7f454a8613fed6342992b88f5f90845b1f4.png

To get some transparency the alpha keyword can be used.

df.plot.scatter(
    x='GDP', 
    y='Child mortality', 
    s='Population',
    alpha=0.5
)

<Axes: xlabel='GDP', ylabel='Child mortality'>

_images/e7f60a58c5860692cf756a505b5600dcb9c361c075f3a490829679462ce7fa7d.png

The original figure has logarithmic scales

df.plot.scatter(
    x='GDP', 
    y='Child mortality', 
    s='Population',
    alpha=0.5,
    logx=True,
    logy=True,
)

<Axes: xlabel='GDP', ylabel='Child mortality'>

_images/334b825f365b4233ce1aedcde4c7fe292a27859e9a0bb3cd99b8f0ad5ea8e143.png

Where are we in all this?

df.plot.scatter(
    x='GDP', 
    y='Child mortality', 
    s='Population',
    alpha=0.5,
    logx=True,
    logy=True,
)
df.loc[['Sweden']].plot.scatter(
    x='GDP', 
    y='Child mortality', 
    s='Population',
    alpha=0.5,
    logx=True,
    logy=True,
    c='green',
)

<Axes: xlabel='GDP', ylabel='Child mortality'>

_images/2624a51c75c593012fb5f84dd9a724b6068c344a1f251b616d9d90f732490512.png

As we get more advanced and attach more things it is wise to work with the Axes objects directly

fig, ax = plt.subplots()
df.plot.scatter(
    x='GDP', 
    y='Child mortality', 
    s='Population',
    alpha=0.5,
    logx=True,
    logy=True,
    ax=ax,
)
df.loc[['Sweden', 'USA']].plot.scatter(
    x='GDP', 
    y='Child mortality', 
    s='Population',
    alpha=0.5,
    logx=True,
    logy=True,
    c='green',
    ax=ax
)

<Axes: xlabel='GDP', ylabel='Child mortality'>

_images/bb059d69a07dc322fccddde5efcad5c3391caacf5c4d19d4e258476951ad2330.png

The original has inverted y axis (so that we plot increasing health vs increasing wealth)

fig, ax = plt.subplots()
df.plot.scatter(
    x='GDP', 
    y='Child mortality', 
    s='Population',
    alpha=0.5,
    logx=True,
    logy=True,
    ax=ax,
)
df.loc[['Sweden', 'USA']].plot.scatter(
    x='GDP', 
    y='Child mortality', 
    s='Population',
    alpha=0.5,
    logx=True,
    logy=True,
    c='green',
    ax=ax
)
ax.invert_yaxis()

_images/b827e2018b54ae07199324d721d87fa225e54493a3e46ce67606a2dfe68a3da1.png

Set some text labels for the selected countries, and background year

selected = ['Sweden', 'USA']

fig, ax = plt.subplots()
df.plot.scatter(
    x='GDP', 
    y='Child mortality', 
    s='Population',
    alpha=0.5,
    logx=True,
    logy=True,
    ax=ax,
)
df.loc[selected].plot.scatter(
    x='GDP', 
    y='Child mortality', 
    s='Population',
    alpha=0.5,
    logx=True,
    logy=True,
    c='green',
    ax=ax
)
ax.invert_yaxis()
for country in selected:
    x = df.loc[country, 'GDP']
    y = df.loc[country, 'Child mortality']
    ax.text(1.1*x, y, country, bbox={'boxstyle': 'round', 'facecolor': 'wheat'})
ax.text(.5, .5, '1921', transform=ax.transAxes, fontsize=100, color='gray', alpha=0.25, ha='center', va='center')

Text(0.5, 0.5, '1921')

_images/810a9f16e13349710f856436da4106acd46176cc088b10812956b85559362d4b.png

Set values on the axis for the selected countries

selected = ['Sweden', 'USA']

fig, ax = plt.subplots()
df.plot.scatter(
    x='GDP', 
    y='Child mortality', 
    s='Population',
    alpha=0.5,
    logx=True,
    logy=True,
    ax=ax,
)
df.loc[selected].plot.scatter(
    x='GDP', 
    y='Child mortality', 
    s='Population',
    alpha=0.5,
    logx=True,
    logy=True,
    c='green',
    ax=ax
)
ax.invert_yaxis()
for country in selected:
    x = df.loc[country, 'GDP']
    y = df.loc[country, 'Child mortality']
    ax.text(1.1*x, y, country, bbox={'boxstyle': 'round', 'facecolor': 'wheat'})
ax.text(.5, .5, '1921', transform=ax.transAxes, fontsize=100, color='gray', alpha=0.25, ha='center', va='center')

ax.set_xticks(df.loc[selected, 'GDP'], labels = [str(int(x)) for x in df.loc[selected, 'GDP']], rotation=90)
ax.set_yticks(df.loc[selected, 'Child mortality'], labels = [str(int(y)) for y in df.loc[selected, 'Child mortality']])
ax.grid(True)

_images/1fcbe1850a70b61b95bafab0dde466839dd55e396daeb371f886159d4659f4eb.png

Finally we put in a historical perspective so that all years fit in the graph

selected = ['Sweden', 'USA']

fig, ax = plt.subplots()
df.plot.scatter(
    x='GDP', 
    y='Child mortality', 
    s='Population',
    alpha=0.5,
    logx=True,
    logy=True,
    ax=ax,
)
df.loc[selected].plot.scatter(
    x='GDP', 
    y='Child mortality', 
    s='Population',
    alpha=0.5,
    logx=True,
    logy=True,
    c='green',
    ax=ax
)

for country in selected:
    x = df.loc[country, 'GDP']
    y = df.loc[country, 'Child mortality']
    ax.text(1.1*x, y, country, bbox={'boxstyle': 'round', 'facecolor': 'wheat'})
ax.text(.5, .5, '1921', transform=ax.transAxes, fontsize=100, color='gray', alpha=0.25, ha='center', va='center')

ax.set_xticks(df.loc[selected, 'GDP'], labels = [str(int(x)) for x in df.loc[selected, 'GDP']], rotation=90)
ax.set_yticks(df.loc[selected, 'Child mortality'], labels = [str(int(y)) for y in df.loc[selected, 'Child mortality']])
ax.grid(True)

# Convert each full table to numbers once and reuse it for both extremes
gdp_numeric = gdp.map(convert)
childm_numeric = childm.map(convert)
# axis=None takes the extreme over the whole table (all countries and all years)
x_min = gdp_numeric.min(axis=None)
x_max = gdp_numeric.max(axis=None)
y_min = childm_numeric.min(axis=None)
y_max = childm_numeric.max(axis=None)
ax.set_xlim(x_min, x_max)
ax.set_ylim(y_min, y_max)

ax.invert_yaxis()

_images/f379e78911e9789f27dcb2a6126d728335cc3d43868723518ddc2bd9de6b8302.png

Challenges:

Make a movie of frames
Make a slider for the chosen year

Classes#

str1 = "Hello world!"

print(str1)

Hello world!

str1.upper()
"""
Implies that there is a upper method in the str class

class str:
     def upper(self):
         returns a new string replacing lower with upper case

"""

'\nImplies that there is a upper method in the str class\n\nclass str:\n     def upper(self):\n         returns a new string replacing lower with upper case\n\n'

str1.upper?

Signature: str1.upper()
Docstring: Return a copy of the string converted to uppercase.
Type:      builtin_function_or_method

str.upper

<method 'upper' of 'str' objects>

str.upper?

Signature: str.upper(self, /)
Docstring: Return a copy of the string converted to uppercase.
Type:      method_descriptor

str.upper(str1) # call the upper function of the str class with str1 as argument

'HELLO WORLD!'

str1.upper() # whatever class str1 belongs to, call its upper method with str1 as the first argument (usually declared with the name self)

'HELLO WORLD!'

# example
class Person:
    """
    Datatype for personal data
    """
    # given_name: the person's given (first) name
    # surname: the person's family name
    def __init__(self, given_name, surname):
        self.given_name = given_name
        self.surname = surname

    def __str__(self): # usually a nicely readable string representation of the object
        return f"Person: {self.given_name} {self.surname}"

    def __repr__(self): # usually a unique accurate representation of an object
        # !r inserts repr() of each name, so quotes or backslashes inside a
        # name are escaped and the result remains a valid Python expression
        return f"Person({self.given_name!r}, {self.surname!r})"

Person("Joe", "H.")

Person('Joe', 'H.')

Person('Joe', 'H.')

Person('Joe', 'H.')

Person?

Init signature: Person(given_name, surname)
Docstring:      Datatype for personal data
Type:           type
Subclasses:     

type(int)

type

int()

type(int())

int

Person('Jane', 'Smith')

Person('Jane', 'Smith')

p=Person('Jane', 'Smith')
type(p)

__main__.Person

dir(p)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__firstlineno__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__static_attributes__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 'given_name',
 'surname']

p.given_name

'Jane'

p.surname

'Smith'

print(p)

Person: Jane Smith

str(p) # delegates to p.__str__()

'Person: Jane Smith'

p.__str__()

'Person: Jane Smith'

p.given_name = "John"

print(p)

Person: John Smith

Person('John', 'Smith')

class Person:
    """
    Datatype for personal data
    """
    number = 0 # class attribute
    def __init__(self, given_name, surname):
        self.given_name = given_name # instance attribute
        self.surname = surname
        # count the created objects; the augmented assignment stores the new
        # value on the instance's own class
        self.__class__.number += 1

    def __str__(self): # usually a nicely readable string representation of the object
        return f"Person: {self.given_name} {self.surname}"

    def __repr__(self): # usually a unique accurate representation of an object
        # !r inserts repr() of each name, so quotes or backslashes inside a
        # name are escaped and the result remains a valid Python expression
        return f"Person({self.given_name!r}, {self.surname!r})"

    def display_person(self):
        # prints the name in the form "surname, given_name"
        print(f"{self.surname}, {self.given_name}")

p1 = Person("Jane", "Smith")
p2 = Person("John", "Smith") # two different instances of class Person

p1.display_person()
p2.display_person()

Smith, Jane
Smith, John

p1.number

p2.number

Person.number

chosen_attribute = "given_name"
# look up an attribute whose name is only known at run time;
# the built-in getattr(p1, chosen_attribute) is the usual way to spell this
p1.__getattribute__(chosen_attribute)

'Jane'

# Addition operator +

2 + 3

"ab" + "cd"

'abcd'

[1] + [2]

[1, 2]

# Example: complex number

class MyComplex:
    """
    Implements complex numbers in Python

    The real part is stored in `re` and the imaginary part in `im`,
    both converted to float.
    """
    def __init__(self, re=0, im=0):
        self.re = float(re)
        self.im = float(im)

    def __repr__(self):
        return f"MyComplex({self.re}, {self.im})"

    def __str__(self):
        # show a negative imaginary part as "a - bi" rather than "a + -bi"
        if self.im < 0:
            return f"{self.re} - {-self.im}i"
        return f"{self.re} + {self.im}i"

    # def __add__(left, right)
    def __add__(self, other):
        # NotImplemented tells Python that this operand cannot handle the
        # addition; it then tries other.__radd__ and finally raises TypeError
        if not isinstance(other, MyComplex):
            return NotImplemented
        return MyComplex(self.re + other.re, self.im + other.im)
        
    

z = MyComplex()
z

MyComplex(0.0, 0.0)

z.re

0.0

z.im

0.0

str(z)

'0.0 + 0.0i'

z1 = MyComplex(1, 2)
z2 = MyComplex(3, 4)

z1 + z2

MyComplex(4.0, 6.0)

print(z1 + z2)

4.0 + 6.0i

Inheritance#

class Employee(Person):
    "An employee class, derives from Person"
    number = 0  # Employee's own class attribute; shadows Person.number
    def __init__(self, given_name, surname, salary):
        # run the parent initializer to set the name attributes; the commented
        # line names the base class explicitly, super() finds it automatically
        #Person.__init__(self, given_name, surname)
        super().__init__(given_name, surname)
        self.salary = salary  # instance attribute added by the subclass

e = Employee("Olav", "Vahtras", 0)

e.display_person()

Vahtras, Olav

e.number

dir(e)[0]

'__class__'

e.__class__

__main__.Employee

Testing#

assert True

assert False

---------------------------------------------------------------------------
AssertionError                            Traceback (most recent call last)
Cell In[460], line 1
----> 1 assert False

AssertionError: 

%%file test_my_math.py
from my_math import my_add
from pytest import approx

def test_my_add():
   assert my_add(1, 1) == 2

def test_my_add_floats():
    assert my_add(.1, .2) == approx(.3)
    

Overwriting test_my_math.py

%%file my_math.py
def my_add(x, y):
    # NOTE: this subtracts instead of adding; the test run that follows
    # fails until the operator is corrected
    return x - y

Overwriting my_math.py

!pytest -v test_my_math.py

============================= test session starts ==============================
platform linux -- Python 3.13.2, pytest-8.3.5, pluggy-1.5.0 -- /home/bb1000-vt25/miniconda3/envs/bb1000/bin/python3.13
cachedir: .pytest_cache
rootdir: /home/bb1000-vt25/bb1000
plugins: cov-6.0.0, anyio-4.8.0
collected 2 items                                                              

test_my_math.py::test_my_add FAILED                                      [ 50%]
test_my_math.py::test_my_add_floats FAILED                               [100%]

=================================== FAILURES ===================================
_________________________________ test_my_add __________________________________

    def test_my_add():
>      assert my_add(1, 1) == 2
E      assert 0 == 2
E       +  where 0 = my_add(1, 1)

test_my_math.py:5: AssertionError
______________________________ test_my_add_floats ______________________________

    def test_my_add_floats():
>       assert my_add(.1, .2) == approx(.3)
E       assert -0.1 == 0.3 ± 3.0e-07
E         
E         comparison failed
E         Obtained: -0.1
E         Expected: 0.3 ± 3.0e-07

test_my_math.py:8: AssertionError
=========================== short test summary info ============================
FAILED test_my_math.py::test_my_add - assert 0 == 2
FAILED test_my_math.py::test_my_add_floats - assert -0.1 == 0.3 ± 3.0e-07
============================== 2 failed in 0.08s ===============================

%%file my_math.py
def my_add(x, y):
    """Return the sum of x and y."""
    return x + y

Overwriting my_math.py

!pytest -v test_my_math.py

============================= test session starts ==============================
platform linux -- Python 3.13.2, pytest-8.3.5, pluggy-1.5.0 -- /home/bb1000-vt25/miniconda3/envs/bb1000/bin/python3.13
cachedir: .pytest_cache
rootdir: /home/bb1000-vt25/bb1000
plugins: cov-6.0.0, anyio-4.8.0
collected 2 items                                                              

test_my_math.py::test_my_add PASSED                                      [ 50%]
test_my_math.py::test_my_add_floats PASSED                               [100%]

============================== 2 passed in 0.02s ===============================

%%file test_leap.py
from leap import is_leap_year
def test_leap_year():
    assert is_leap_year(2000) == True
    assert is_leap_year(1999) == False
    assert is_leap_year(1998) == False
    assert is_leap_year(1996) == True
    assert is_leap_year(1900) == False
    assert is_leap_year(1800) == False
    assert is_leap_year(1600) == True

Overwriting test_leap.py

%%file leap.py
def is_leap_year(year):
    """
    Returns True for leap years

    First attempt: only checks divisibility by 4, so century years
    such as 1900 are also reported as leap years.
    """
    if year % 4 == 0:
        return True
    else:
        return False

Overwriting leap.py

!pytest test_leap.py

============================= test session starts ==============================
platform linux -- Python 3.13.2, pytest-8.3.5, pluggy-1.5.0
rootdir: /home/bb1000-vt25/bb1000
plugins: cov-6.0.0, anyio-4.8.0
collected 1 item                                                               

test_leap.py F                                                           [100%]

=================================== FAILURES ===================================
________________________________ test_leap_year ________________________________

    def test_leap_year():
        assert is_leap_year(2000) == True
        assert is_leap_year(1999) == False
        assert is_leap_year(1998) == False
        assert is_leap_year(1996) == True
>       assert is_leap_year(1900) == False
E       assert True == False
E        +  where True = is_leap_year(1900)

test_leap.py:7: AssertionError
=========================== short test summary info ============================
FAILED test_leap.py::test_leap_year - assert True == False
============================== 1 failed in 0.07s ===============================

%%file leap.py
def is_leap_year(year):
    """
    Returns True for leap years

    Century years (divisible by 100) are leap years only when they are
    also divisible by 400; all other years when they are divisible by 4.
    """
    if year % 100 == 0:
        # century year: 1600 and 2000 are leap years, 1800 and 1900 are not
        if year % 400 == 0:
            return True
        else:
            return False
    else:
        # ordinary year: every fourth year is a leap year
        if year % 4 == 0:
            return True
        else:
            return False

Overwriting leap.py

!pytest -v test_leap.py

============================= test session starts ==============================
platform linux -- Python 3.13.2, pytest-8.3.5, pluggy-1.5.0 -- /home/bb1000-vt25/miniconda3/envs/bb1000/bin/python3.13
cachedir: .pytest_cache
rootdir: /home/bb1000-vt25/bb1000
plugins: cov-6.0.0, anyio-4.8.0
collected 1 item                                                               

test_leap.py::test_leap_year PASSED                                      [100%]

============================== 1 passed in 0.02s ===============================

# cleanup / improve code

%%file leap.py
def is_leap_year(year: int) -> bool:
    """
    Returns True for leap years

    Century years (divisible by 100) are leap years only when they are
    also divisible by 400; all other years when they are divisible by 4.
    """
    if year % 100 == 0:
        # the comparison already evaluates to True/False, so return it directly
        return year % 400 == 0
    else:
        return year % 4 == 0

Overwriting leap.py

!pytest -v

============================= test session starts ==============================
platform linux -- Python 3.13.2, pytest-8.3.5, pluggy-1.5.0 -- /home/bb1000-vt25/miniconda3/envs/bb1000/bin/python3.13
cachedir: .pytest_cache
rootdir: /home/bb1000-vt25/bb1000
plugins: cov-6.0.0, anyio-4.8.0
collected 8 items                                                              

git_demo/test_savings.py::test_one_year PASSED                           [ 12%]
git_demo/test_savings.py::test_two_years PASSED                          [ 25%]
git_demo/test_savings.py::test_two_years_only_inital_deposit PASSED      [ 37%]
test_leap.py::test_leap_year PASSED                                      [ 50%]
test_my_math.py::test_my_add PASSED                                      [ 62%]
test_my_math.py::test_my_add_floats PASSED                               [ 75%]
test_timestamps.py::test_1 PASSED                                        [ 87%]
test_timestamps.py::test_6 FAILED                                        [100%]

=================================== FAILURES ===================================
____________________________________ test_6 ____________________________________

    def test_6():
        """
        >>> sum_timestamps(['6:35:32', '2:45:48', '40:10'])
        '10:01:30'
        """
>       assert timestamps(['6:35:32', '2:45:48', '40:10']) == '10:01:30'
E       TypeError: 'module' object is not callable

test_timestamps.py:13: TypeError
=========================== short test summary info ============================
FAILED test_timestamps.py::test_6 - TypeError: 'module' object is not callable
========================= 1 failed, 7 passed in 0.54s ==========================

%%file test_leap.py
from leap import is_leap_year
import pytest

# (year, expected result) pairs; pytest runs one test case per pair
TEST_DATA = [
    (2000, True),
    (1999, False),
    (1998, False),
    (1996, True),
    (1900, False),
    (1800, False),
    (1600, True),
]

@pytest.mark.parametrize('year,expected', TEST_DATA)
def test_leap_year(year, expected):
    """Check is_leap_year against a single (year, expected) pair."""
    assert is_leap_year(year) == expected

Overwriting test_leap.py

!pytest -v

============================= test session starts ==============================
platform linux -- Python 3.13.2, pytest-8.3.5, pluggy-1.5.0 -- /home/bb1000-vt25/miniconda3/envs/bb1000/bin/python3.13
cachedir: .pytest_cache
rootdir: /home/bb1000-vt25/bb1000
plugins: cov-6.0.0, anyio-4.8.0
collected 14 items                                                             

git_demo/test_savings.py::test_one_year PASSED                           [  7%]
git_demo/test_savings.py::test_two_years PASSED                          [ 14%]
git_demo/test_savings.py::test_two_years_only_inital_deposit PASSED      [ 21%]
test_leap.py::test_leap_year[2000-True] PASSED                           [ 28%]
test_leap.py::test_leap_year[1999-False] PASSED                          [ 35%]
test_leap.py::test_leap_year[1998-False] PASSED                          [ 42%]
test_leap.py::test_leap_year[1996-True] PASSED                           [ 50%]
test_leap.py::test_leap_year[1900-False] PASSED                          [ 57%]
test_leap.py::test_leap_year[1800-False] PASSED                          [ 64%]
test_leap.py::test_leap_year[1600-True] PASSED                           [ 71%]
test_my_math.py::test_my_add PASSED                                      [ 78%]
test_my_math.py::test_my_add_floats PASSED                               [ 85%]
test_timestamps.py::test_1 PASSED                                        [ 92%]
test_timestamps.py::test_6 FAILED                                        [100%]

=================================== FAILURES ===================================
____________________________________ test_6 ____________________________________

    def test_6():
        """
        >>> sum_timestamps(['6:35:32', '2:45:48', '40:10'])
        '10:01:30'
        """
>       assert timestamps(['6:35:32', '2:45:48', '40:10']) == '10:01:30'
E       TypeError: 'module' object is not callable

test_timestamps.py:13: TypeError
=========================== short test summary info ============================
FAILED test_timestamps.py::test_6 - TypeError: 'module' object is not callable
========================= 1 failed, 13 passed in 0.47s =========================

# coverage
!conda install pytest-cov -y

Channels:
 - defaults
Platform: linux-64
Collecting package metadata (repodata.json): done
Solving environment: done


==> WARNING: A newer version of conda exists. <==
    current version: 25.1.1
    latest version: 25.3.1

Please update conda by running

    $ conda update -n base -c defaults conda



# All requested packages already installed.

!pytest test_leap.py --cov leap

============================= test session starts ==============================
platform linux -- Python 3.13.2, pytest-8.3.5, pluggy-1.5.0
rootdir: /home/bb1000-vt25/bb1000
plugins: cov-6.0.0, anyio-4.8.0
collected 7 items                                                              

test_leap.py .......                                                     [100%]

---------- coverage: platform linux, python 3.13.2-final-0 -----------
Name      Stmts   Miss  Cover
-----------------------------
leap.py       4      0   100%
-----------------------------
TOTAL         4      0   100%


============================== 7 passed in 0.06s ===============================

!pytest test_leap.py --cov leap --cov-report=html

============================= test session starts ==============================
platform linux -- Python 3.13.2, pytest-8.3.5, pluggy-1.5.0
rootdir: /home/bb1000-vt25/bb1000
plugins: cov-6.0.0, anyio-4.8.0
collected 7 items                                                              

test_leap.py .......                                                     [100%]

---------- coverage: platform linux, python 3.13.2-final-0 -----------
Coverage HTML written to dir htmlcov


============================== 7 passed in 0.09s ===============================

doctest#

%%file leap.py
def is_leap_year(year: int) -> bool:
    """
    Returns True for leap years

    Century years (divisible by 100) are leap years only when they are
    also divisible by 400; all other years when they are divisible by 4.
    The examples below are run as tests by the doctest module.

    >>> is_leap_year(2000)
    True
    >>> is_leap_year(1900)
    False
    
    """
    if year % 100 == 0:
        return year % 400 == 0
    else:
        return year % 4 == 0

Overwriting leap.py

!python -m doctest leap.py -v

Trying:
    is_leap_year(2000)
Expecting:
    True
ok
Trying:
    is_leap_year(1900)
Expecting:
    False
ok
1 item had no tests:
    leap
1 item passed all tests:
   2 tests in leap.is_leap_year
2 tests in 2 items.
2 passed.
Test passed.

# indentation error
if True:
 print("Hello")
  print("Hello")

  Cell In[481], line 4
    print("Hello")
    ^
IndentationError: unexpected indent

# indentation 2-spaces
if True:
  print("Hello")
  print("Hello")

Hello
Hello

# indentation 4-spaces (PEP8 convention)
if True:
    print("Hello")
    print("Hello")

Hello
Hello

Version control#

!git --help

usage: git [-v | --version] [-h | --help] [-C <path>] [-c <name>=<value>]
           [--exec-path[=<path>]] [--html-path] [--man-path] [--info-path]
           [-p | --paginate | -P | --no-pager] [--no-replace-objects] [--no-lazy-fetch]
           [--no-optional-locks] [--no-advice] [--bare] [--git-dir=<path>]
           [--work-tree=<path>] [--namespace=<name>] [--config-env=<name>=<envvar>]
           <command> [<args>]

These are common Git commands used in various situations:

start a working area (see also: git help tutorial)
   clone     Clone a repository into a new directory
   init      Create an empty Git repository or reinitialize an existing one

work on the current change (see also: git help everyday)
   add       Add file contents to the index
   mv        Move or rename a file, a directory, or a symlink
   restore   Restore working tree files
   rm        Remove files from the working tree and from the index

examine the history and state (see also: git help revisions)
   bisect    Use binary search to find the commit that introduced a bug
   diff      Show changes between commits, commit and working tree, etc
   grep      Print lines matching a pattern
   log       Show commit logs
   show      Show various types of objects
   status    Show the working tree status

grow, mark and tweak your common history
   branch    List, create, or delete branches
   commit    Record changes to the repository
   merge     Join two or more development histories together
   rebase    Reapply commits on top of another base tip
   reset     Reset current HEAD to the specified state
   switch    Switch branches
   tag       Create, list, delete or verify a tag object signed with GPG

collaborate (see also: git help workflows)
   fetch     Download objects and refs from another repository
   pull      Fetch from and integrate with another repository or a local branch
   push      Update remote refs along with associated objects

'git help -a' and 'git help -g' list available subcommands and some
concept guides. See 'git help <command>' or 'git help <concept>'
to read about a specific subcommand or concept.
See 'git help git' for an overview of the system.

configuration#

!git config --global user.name "Olav Vahtras"

!git config --global user.email vahtras@kth.se

!more ~/.gitconfig

[user]
	email = vahtras@kth.se
	name = Olav Vahtras

%cd ~/bb1000

/home/bb1000-vt25/bb1000

%mkdir git_demo
%cd git_demo/

/home/bb1000-vt25/bb1000/git_demo

#initialize current folder for git
! git init .

hint: Using 'master' as the name for the initial branch. This default branch name
hint: is subject to change. To configure the initial branch name to use in all
hint: of your new repositories, which will suppress this warning, call:
hint:
hint: 	git config --global init.defaultBranch <name>
hint:
hint: Names commonly chosen instead of 'master' are 'main', 'trunk' and
hint: 'development'. The just-created branch can be renamed via this command:
hint:
hint: 	git branch -m <name>
Initialized empty Git repository in /home/bb1000-vt25/bb1000/git_demo/.git/

!ls -a

.  ..  .git

!git status

On branch master

No commits yet

nothing to commit (create/copy files and use "git add" to track)

# work...

%%file savings.py
# Calculate retirement savings

def savings_calculator(amount, interest):
    final_amount = amount + amount*interest
    return final_amount

Overwriting savings.py

%%file test_savings.py
import savings

def test_one_year():
    initial_amount = 500
    annual_interest = .05
    expected_final = 525
    calculated_final = savings.savings_calculator(initial_amount, annual_interest)
    assert calculated_final == expected_final

Overwriting test_savings.py

!git status

On branch master

No commits yet

Untracked files:
  (use "git add <file>..." to include in what will be committed)
	__pycache__/
	savings.py
	test_savings.py

nothing added to commit but untracked files present (use "git add" to track)

# create .gitignore file to list files to ignore

%%file .gitignore
*.pyc

Writing .gitignore

!git status

On branch master

No commits yet

Untracked files:
  (use "git add <file>..." to include in what will be committed)
	.gitignore
	savings.py
	test_savings.py

nothing added to commit but untracked files present (use "git add" to track)

# stage the changes: add them to the index (staging area), ready for the next commit
!git add .

!git status

On branch master

No commits yet

Changes to be committed:
  (use "git rm --cached <file>..." to unstage)
	new file:   .gitignore
	new file:   savings.py
	new file:   test_savings.py

!git commit -m "Initial commit"

[master (root-commit) a324f3f] Initial commit
 3 files changed, 14 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 savings.py
 create mode 100644 test_savings.py

!git status

On branch master
nothing to commit, working tree clean

!git log

commit a324f3f127df2eb866b3f870901ab0c91baf41ff (HEAD -> master)
Author: Olav Vahtras <vahtras@kth.se>
Date:   Tue May 20 21:50:43 2025 +0200

    Initial commit

# new changes to code

%%file savings.py
# Calculate retirement savings

def savings_calculator(amount, interest, years=1):
    final_amount = amount*(1 + interest)**years
    return final_amount

Overwriting savings.py

!git status

On branch master
Changes not staged for commit:
  (use "git add <file>..." to update what will be committed)
  (use "git restore <file>..." to discard changes in working directory)
	modified:   savings.py

no changes added to commit (use "git add" and/or "git commit -a")

# stage all modified tracked files (-u leaves untracked files alone)
!git add -u

!git status

On branch master
Changes to be committed:
  (use "git restore --staged <file>..." to unstage)
	modified:   savings.py

#save changes to repository
!git commit -m 'Allow for several years of savings'

[master 0075416] Allow for several years of savings
 1 file changed, 3 insertions(+), 3 deletions(-)

!git status

On branch master
nothing to commit, working tree clean

!git log

commit 0075416fafd7a9f5432bfb9bcd0b7461221215ce (HEAD -> master)
Author: Olav Vahtras <vahtras@kth.se>
Date:   Tue May 20 21:55:23 2025 +0200

    Allow for several years of savings

commit a324f3f127df2eb866b3f870901ab0c91baf41ff
Author: Olav Vahtras <vahtras@kth.se>
Date:   Tue May 20 21:50:43 2025 +0200

    Initial commit

!git log --oneline

0075416 (HEAD -> master) Allow for several years of savings
a324f3f Initial commit

# new changes

%%file savings.py
# Calculate retirement savings

def savings_calculator(amount, interest, years=1):
    final_amount = 0
    for year in range(years):
        final_amount = (amount + final_amount)*(1 + interest)
    return final_amount

Overwriting savings.py

!git status

On branch master
Changes not staged for commit:
  (use "git add <file>..." to update what will be committed)
  (use "git restore <file>..." to discard changes in working directory)
	modified:   savings.py

no changes added to commit (use "git add" and/or "git commit -a")

!git diff

diff --git a/savings.py b/savings.py
index 715f219..46443a6 100644
--- a/savings.py
+++ b/savings.py
@@ -1,5 +1,7 @@
 # Calculate retirement savings
 
 def savings_calculator(amount, interest, years=1):
-    final_amount = amount*(1 + interest)**years
-    return final_amount(bb1000) bb1000-vt25@bat:~/bb1000/git_demo_backup$ 
+    final_amount = 0
+    for year in range(years):
+        final_amount = (amount + final_amount)*(1 + interest)
+    return final_amount

!git add -u

!git status

On branch master
Changes to be committed:
  (use "git restore --staged <file>..." to unstage)
	modified:   savings.py

!git commit -m "Annual deposits with for loop"

[master ea52645] Annual deposits with for loop
 1 file changed, 4 insertions(+), 2 deletions(-)

!git status

On branch master
nothing to commit, working tree clean

!git log

commit ea526456f52d4a6e361f46ec799a7748ea257469 (HEAD -> master)
Author: Olav Vahtras <vahtras@kth.se>
Date:   Tue May 20 22:00:38 2025 +0200

    Annual deposits with for loop

commit 0075416fafd7a9f5432bfb9bcd0b7461221215ce
Author: Olav Vahtras <vahtras@kth.se>
Date:   Tue May 20 21:55:23 2025 +0200

    Allow for several years of savings

commit a324f3f127df2eb866b3f870901ab0c91baf41ff
Author: Olav Vahtras <vahtras@kth.se>
Date:   Tue May 20 21:50:43 2025 +0200

    Initial commit

!git log --oneline

ea52645 (HEAD -> master) Annual deposits with for loop
0075416 Allow for several years of savings
a324f3f Initial commit

# have a look at the previous version
!git checkout 0075416

Note: switching to '0075416'.

You are in 'detached HEAD' state. You can look around, make experimental
changes and commit them, and you can discard any commits you make in this
state without impacting any branches by switching back to a branch.

If you want to create a new branch to retain commits you create, you may
do so (now or later) by using -c with the switch command. Example:

  git switch -c <new-branch-name>

Or undo this operation with:

  git switch -

Turn off this advice by setting config variable advice.detachedHead to false

HEAD is now at 0075416 Allow for several years of savings

!git log --oneline

0075416 (HEAD) Allow for several years of savings
a324f3f Initial commit

!git log --oneline --all

ea52645 (master) Annual deposits with for loop
0075416 (HEAD) Allow for several years of savings
a324f3f Initial commit

!git switch master

Previous HEAD position was 0075416 Allow for several years of savings
Switched to branch 'master'

!git log --oneline --all

ea52645 (HEAD -> master) Annual deposits with for loop
0075416 Allow for several years of savings
a324f3f Initial commit

Branches#

Two lines of development

a script that runs from the command line
varying the number of years during which deposits are made

two git branches

!git branch script
!git branch varying-deposits

!git log --oneline

ea52645 (HEAD -> master, varying-deposits, script) Annual deposits with for loop
0075416 Allow for several years of savings
a324f3f Initial commit

!git switch script

Switched to branch 'script'

!git log --oneline

ea52645 (HEAD -> script, varying-deposits, master) Annual deposits with for loop
0075416 Allow for several years of savings
a324f3f Initial commit

%%file savings.py
# Calculate retirement savings
import sys

def savings_calculator(amount, interest, years=1):
    final_amount = 0
    for year in range(years):
        final_amount = round((amount + final_amount)*(1 + interest), 2)
    return final_amount

if __name__ == "__main__":
    if len(sys.argv) == 3:
        amount = int(sys.argv[1])
        interest = float(sys.argv[2])
        final = savings_calculator(amount, interest)
        years = 1
    elif len(sys.argv) == 4:
        amount = int(sys.argv[1])
        interest = float(sys.argv[2])
        years = int(sys.argv[3])
        final = savings_calculator(amount, interest, years)
    else:
        print(f"Usage: {sys.argv[0]} amount interest [years]")
        exit()

    print(f"Savings after {years} years: {final}")

Overwriting savings.py

!git add -u

!git commit -m "Include command-line arguments"

[script 65a3b2d] Include command-line arguments
 1 file changed, 19 insertions(+), 1 deletion(-)

!git log --oneline --all

65a3b2d (HEAD -> script) Include command-line arguments
ea52645 (varying-deposits, master) Annual deposits with for loop
0075416 Allow for several years of savings
a324f3f Initial commit

!git switch varying-deposits

Switched to branch 'varying-deposits'

!git log --oneline --all

65a3b2d (script) Include command-line arguments
ea52645 (HEAD -> varying-deposits, master) Annual deposits with for loop
0075416 Allow for several years of savings
a324f3f Initial commit

# new changes related to years of depositing

%%file savings.py
# Calculate retirement savings

def savings_calculator(amount, interest, years=1, stop_deposit=None):
    final_amount = 0
    if stop_deposit is None:
        stop_deposit = years
        
    for year in range(years):
        if year < stop_deposit:
            final_amount = (amount + final_amount)*(1 + interest)
        else:
            final_amount = final_amount*(1 + interest)
    return final_amount

Overwriting savings.py

!git diff

diff --git a/savings.py b/savings.py
index 46443a6..d1611da 100644
--- a/savings.py
+++ b/savings.py
@@ -1,7 +1,13 @@
 # Calculate retirement savings
 
-def savings_calculator(amount, interest, years=1):
+def savings_calculator(amount, interest, years=1, stop_deposit=None):
     final_amount = 0
+    if stop_deposit is None:
+        stop_deposit = years
+
     for year in range(years):
-        final_amount = (amount + final_amount)*(1 + interest)
+        if year < stop_deposit:
+            final_amount = (amount + final_amount)*(1 + interest)
+        else:
+            final_amount = final_amount*(1 + interest)
     return final_amount

!git add -u

!git commit -m "Changes for optional deposit years"

[varying-deposits 335928a] Changes for optional deposit years
 1 file changed, 8 insertions(+), 2 deletions(-)

!git log --oneline --all

335928a (HEAD -> varying-deposits) Changes for optional deposit years
65a3b2d (script) Include command-line arguments
ea52645 (master) Annual deposits with for loop
0075416 Allow for several years of savings
a324f3f Initial commit

# include changes in script to master
!git switch master

Switched to branch 'master'

!git merge script

Updating ea52645..65a3b2d
Fast-forward
 savings.py | 20 +++++++++++++++++++-
 1 file changed, 19 insertions(+), 1 deletion(-)

!git log --oneline --all

335928a (varying-deposits) Changes for optional deposit years
65a3b2d (HEAD -> master, script) Include command-line arguments
ea52645 Annual deposits with for loop
0075416 Allow for several years of savings
a324f3f Initial commit

!git merge varying-deposits

Auto-merging savings.py
CONFLICT (content): Merge conflict in savings.py
Automatic merge failed; fix conflicts and then commit the result.

!git status

On branch master
You have unmerged paths.
  (fix conflicts and run "git commit")
  (use "git merge --abort" to abort the merge)

Unmerged paths:
  (use "git add <file>..." to mark resolution)
	both modified:   savings.py

no changes added to commit (use "git add" and/or "git commit -a")

The file savings.py is now in a state of conflict:

# Calculate retirement savings
import sys

def savings_calculator(amount, interest, years=1, stop_deposit=None):
    final_amount = 0
    if stop_deposit is None:
        stop_deposit = years
        
    for year in range(years):
<<<<<<< HEAD
        final_amount = round((amount + final_amount)*(1 + interest), 2)
    return final_amount

if __name__ == "__main__":
    if len(sys.argv) == 3:
        amount = int(sys.argv[1])
        interest = float(sys.argv[2])
        final = savings_calculator(amount, interest)
        years = 1
    elif len(sys.argv) == 4:
        amount = int(sys.argv[1])
        interest = float(sys.argv[2])
        years = int(sys.argv[3])
        final = savings_calculator(amount, interest, years)
    else:
        print(f"Usage: {sys.argv[0]} amount interest [years]")
        exit()

    print(f"Savings after {years} years: {final}")

        
=======
        if year < stop_deposit:
            final_amount = (amount + final_amount)*(1 + interest)
        else:
            final_amount = final_amount*(1 + interest)
    return final_amount
>>>>>>> varying-deposits

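Fix the merge: edit savings.py by hand so that it combines both changes and no longer contains conflict markers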

%%file savings.py
# Calculate retirement savings
import sys

def savings_calculator(amount, interest, years=1, stop_deposit=None):
    """Return the balance after ``years`` years of compound interest.

    ``amount`` is deposited at the start of each year before
    ``stop_deposit`` (every year when ``stop_deposit`` is None); the balance
    is then multiplied by ``1 + interest`` at the end of every year.
    """
    last_deposit_year = years if stop_deposit is None else stop_deposit

    balance = 0
    for year in range(years):
        # Deposits end at last_deposit_year, interest is earned every year.
        principal = amount + balance if year < last_deposit_year else balance
        balance = principal * (1 + interest)
    return balance

if __name__ == "__main__":
    # Command-line interface: savings.py amount interest [years]
    if len(sys.argv) not in (3, 4):
        # sys.exit(message) prints the message to stderr and exits with
        # status 1; the bare exit() helper is provided by the site module
        # for the interactive prompt and should not be used in scripts.
        sys.exit(f"Usage: {sys.argv[0]} amount interest [years]")

    amount = int(sys.argv[1])
    interest = float(sys.argv[2])
    # The number of years is optional and defaults to a single year.
    years = int(sys.argv[3]) if len(sys.argv) == 4 else 1
    final = savings_calculator(amount, interest, years)

    print(f"Savings after {years} years: {final}")

    

Overwriting savings.py

!git status

On branch master
You have unmerged paths.
  (fix conflicts and run "git commit")
  (use "git merge --abort" to abort the merge)

Unmerged paths:
  (use "git add <file>..." to mark resolution)
	both modified:   savings.py

no changes added to commit (use "git add" and/or "git commit -a")

%%file savings.py
# Calculate retirement savings
import sys

def savings_calculator(amount, interest, years=1, stop_deposit=None):
    """Return the balance after ``years`` years of compound interest.

    ``amount`` is deposited at the start of each year before
    ``stop_deposit`` (every year when ``stop_deposit`` is None); the balance
    is then multiplied by ``1 + interest`` at the end of every year.
    """
    last_deposit_year = years if stop_deposit is None else stop_deposit

    balance = 0
    for year in range(years):
        # Deposits end at last_deposit_year, interest is earned every year.
        principal = amount + balance if year < last_deposit_year else balance
        balance = principal * (1 + interest)
    return balance

if __name__ == "__main__":
    # Command-line interface: savings.py amount interest [years]
    if len(sys.argv) not in (3, 4):
        # sys.exit(message) prints the message to stderr and exits with
        # status 1; the bare exit() helper is provided by the site module
        # for the interactive prompt and should not be used in scripts.
        sys.exit(f"Usage: {sys.argv[0]} amount interest [years]")

    amount = int(sys.argv[1])
    interest = float(sys.argv[2])
    # The number of years is optional and defaults to a single year.
    years = int(sys.argv[3]) if len(sys.argv) == 4 else 1
    final = savings_calculator(amount, interest, years)

    print(f"Savings after {years} years: {final}")

Overwriting savings.py

!git add -u

!git commit -m "fix conflicts"

[master a71a30b] fix conflicts

!git status

On branch master
nothing to commit, working tree clean

Remote repositories#

!git remote add kth-github git@gits-15.sys.kth.se:BB1000/git_demo.git

!git remote

kth-github

!git remote -v

kth-github	git@gits-15.sys.kth.se:BB1000/git_demo.git (fetch)
kth-github	git@gits-15.sys.kth.se:BB1000/git_demo.git (push)

!git push kth-github master

Enumerating objects: 20, done.
Counting objects: 100% (20/20), done.
Delta compression using up to 12 threads
Compressing objects: 100% (19/19), done.
Writing objects: 100% (20/20), 2.50 KiB | 2.50 MiB/s, done.
Total 20 (delta 3), reused 0 (delta 0), pack-reused 0 (from 0)
remote: Resolving deltas: 100% (3/3), done.
To gits-15.sys.kth.se:BB1000/git_demo.git
 + 56b59e6...a71a30b master -> master (forced update)

Enumerating objects: 23, done.
Counting objects: 100% (23/23), done.
Delta compression using up to 12 threads
Compressing objects: 100% (22/22), done.
Writing objects: 100% (23/23), 2.66 KiB | 680.00 KiB/s, done.
Total 23 (delta 6), reused 0 (delta 0), pack-reused 0 (from 0)
remote: Resolving deltas: 100% (6/6), done.
To gits-15.sys.kth.se:BB1000/git_demo.git
 * [new branch]      master -> master

!git log --oneline --all

a71a30b (HEAD -> master, kth-github/master) fix conflicts
335928a (varying-deposits) Changes for optional deposit years
65a3b2d (script) Include command-line arguments
ea52645 Annual deposits with for loop
0075416 Allow for several years of savings
a324f3f Initial commit

%cd /tmp

!git clone git@gits-15.sys.kth.se:BB1000/git_demo

Cloning into 'git_demo'...
remote: Enumerating objects: 23, done.
remote: Counting objects: 100% (23/23), done.
remote: Compressing objects: 100% (16/16), done.
remote: Total 23 (delta 6), reused 23 (delta 6), pack-reused 0
Receiving objects: 100% (23/23), done.
Resolving deltas: 100% (6/6), done.

%cd git_demo

/tmp/git_demo

!git log --oneline --all

5ee6af4 (HEAD -> master, origin/master, origin/HEAD) fix conflicts
25bbab9 Changes for optional deposit years
a884958 Include command-line arguments
542ba6f Annual deposits with for loop
9eb01ff Allow for several years of savings
700da30 Initial commit

%cd /tmp/git_demo

/tmp/git_demo

# on second computer/ second user
!git log --oneline --all

85ebcbf (HEAD -> master) Define start end stop deposit years
5ee6af4 (origin/master, origin/HEAD) fix conflicts
25bbab9 Changes for optional deposit years
a884958 Include command-line arguments
542ba6f Annual deposits with for loop
9eb01ff Allow for several years of savings
700da30 Initial commit

!git remote -v

origin	git@gits-15.sys.kth.se:BB1000/git_demo (fetch)
origin	git@gits-15.sys.kth.se:BB1000/git_demo (push)

!git push origin master

Enumerating objects: 5, done.
Counting objects: 100% (5/5), done.
Delta compression using up to 12 threads
Compressing objects: 100% (3/3), done.
Writing objects: 100% (3/3), 400 bytes | 400.00 KiB/s, done.
Total 3 (delta 1), reused 0 (delta 0), pack-reused 0 (from 0)
remote: Resolving deltas: 100% (1/1), completed with 1 local object.
To gits-15.sys.kth.se:BB1000/git_demo
   5ee6af4..85ebcbf  master -> master

# back to first computer
%cd ~/bb1000/git_demo

/home/bb1000-vt25/bb1000/git_demo

# update local information
# two possibilities: git pull, or git fetch followed by git merge

!git remote -v

kth-github	git@gits-15.sys.kth.se:BB1000/git_demo.git (fetch)
kth-github	git@gits-15.sys.kth.se:BB1000/git_demo.git (push)

!git log --oneline --all

5ee6af4 (HEAD -> master, kth-github/master) fix conflicts
25bbab9 (varying-deposits) Changes for optional deposit years
a884958 (script) Include command-line arguments
542ba6f Annual deposits with for loop
9eb01ff Allow for several years of savings
700da30 Initial commit

!git fetch --all # update our local information about all remote

remote: Enumerating objects: 5, done.
remote: Counting objects: 100% (5/5), done.
remote: Compressing objects: 100% (2/2), done.
remote: Total 3 (delta 1), reused 3 (delta 1), pack-reused 0
Unpacking objects: 100% (3/3), 380 bytes | 380.00 KiB/s, done.
From gits-15.sys.kth.se:BB1000/git_demo
   5ee6af4..85ebcbf  master     -> kth-github/master

!git log --oneline --all

85ebcbf (kth-github/master, kth-github/HEAD) Define start end stop deposit years
5ee6af4 (HEAD -> master) fix conflicts
25bbab9 (varying-deposits) Changes for optional deposit years
a884958 (script) Include command-line arguments
542ba6f Annual deposits with for loop
9eb01ff Allow for several years of savings
700da30 Initial commit

!git merge kth-github/master

Updating 5ee6af4..85ebcbf
Fast-forward
 savings.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

!git log --oneline --all

85ebcbf (HEAD -> master, kth-github/master, kth-github/HEAD) Define start end stop deposit years
5ee6af4 fix conflicts
25bbab9 (varying-deposits) Changes for optional deposit years
a884958 (script) Include command-line arguments
542ba6f Annual deposits with for loop
9eb01ff Allow for several years of savings
700da30 Initial commit

# include new updates with git pull
!git pull

remote: Enumerating objects: 5, done.
remote: Counting objects: 100% (5/5), done.
remote: Compressing objects: 100% (3/3), done.
remote: Total 3 (delta 1), reused 0 (delta 0), pack-reused 0
Unpacking objects: 100% (3/3), 395 bytes | 395.00 KiB/s, done.
From gits-15.sys.kth.se:BB1000/git_demo
   85ebcbf..cbce447  master     -> kth-github/master
There is no tracking information for the current branch.
Please specify which branch you want to merge with.
See git-pull(1) for details.

    git pull <remote> <branch>

If you wish to set tracking information for this branch you can do so with:

    git branch --set-upstream-to=kth-github/<branch> master

!git pull kth-github master

From gits-15.sys.kth.se:BB1000/git_demo
 * branch            master     -> FETCH_HEAD
Updating 85ebcbf..cbce447
Fast-forward
 savings.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

!git log --oneline --all

cbce447 (HEAD -> master, kth-github/master, kth-github/HEAD) Update savings.py: round output to two decimals
85ebcbf Define start end stop deposit years
5ee6af4 fix conflicts
25bbab9 (varying-deposits) Changes for optional deposit years
a884958 (script) Include command-line arguments
542ba6f Annual deposits with for loop
9eb01ff Allow for several years of savings
700da30 Initial commit

update: plot savings history#

# %load git_demo/savings.py
# Calculate retirement savings
import sys
import matplotlib.pyplot as plt

def savings_calculator(amount, interest, years=1, start_deposit=None, stop_deposit=None, plot=False):
    """Return the balance after ``years`` years, rounded to two decimals each year.

    ``amount`` is added at the start of every year in the half-open window
    ``start_deposit <= year < stop_deposit`` (all years by default), and the
    balance is multiplied by ``1 + interest`` at the end of every year.
    With ``plot=True`` the yearly balances are drawn with ``plt.plot``;
    showing the figure is left to the caller.
    """
    first_year = 0 if start_deposit is None else start_deposit
    end_year = years if stop_deposit is None else stop_deposit

    balance = 0
    history = []
    for year in range(years):
        depositing = first_year <= year < end_year
        principal = amount + balance if depositing else balance
        balance = round(principal * (1 + interest), 2)
        history.append(balance)

    if plot:
        plt.plot(history)

    return balance

Compare early career savings vs late career savings (10000 per year)

final = savings_calculator(10000, .1, years=40, start_deposit=0, stop_deposit=10, plot=True); print("Total deposits", 10000*10, ", final savings", final)
final = savings_calculator(10000, .1, years=40, start_deposit=10, stop_deposit=40, plot=True); print("Total deposits", 10000*30, ", final savings", final)

plt.show()

Total deposits 100000 , final savings 3059084.15
Total deposits 300000 , final savings 1809434.33

_images/cb2ee41e9d23172c900c73f64d2d5a83c1aa8bb209cd057f2d528251d71b608f.png

Conclusion: the late saver who puts down more money never catches up to the early saver, due to compound interest over several years

Advanced topics#

Decorators#

# function with arbitrary arguments

#example
print('Hello')
print('Hello', 'world')

Hello
Hello world

def f(*args):
    """Print the tuple that collects every positional argument of the call."""
    collected = args  # always a tuple; empty when f() is called without arguments
    print(collected)

f()

()

f('Hello')

('Hello',)

f('Hello', 'world')

('Hello', 'world')

def g(*args, **kwargs):
    """Print a header, the positional-argument tuple and the keyword-argument dict, one per line."""
    print('In function g:', args, kwargs, sep='\n')

g()

In function g:
()
{}

g('Hello', who='World')

In function g:
('Hello',)
{'who': 'World'}

def h(*args, **kwargs):
    """Report the received arguments, then forward them unchanged to g."""
    print(f'in function h: {args!s} {kwargs!s}')
    # Unpacking with * and ** passes the arguments on exactly as they came in.
    g(*args, **kwargs)

h('Hello', who='World')

in function h: ('Hello',) {'who': 'World'}
In function g:
('Hello',)
{'who': 'World'}

# example timing
import math
import time

def slow_math(x):
    """Return the square root of x after a deliberate one-second pause."""
    time.sleep(1)  # stand-in for an expensive computation
    root = math.sqrt(x)
    return root
    

t1 = time.time()
slow_math(2)
t2 = time.time()
print("Time used in slow_math", round(t2-t1, 1))

Time used in slow_math 1.0

t1 = time.time()
slow_math(3)
t2 = time.time()
print("Time used in slow_math", round(t2-t1, 1))

Time used in slow_math 1.0

# with a decorator - function that takes a function as input, and returns a function

def timeme(function):
    """Decorator: return a wrapper that prints how long each call to ``function`` took."""
    def timed_function(*args, **kwargs):
        started = time.time()
        result = function(*args, **kwargs)
        elapsed = time.time() - started
        # Report in seconds, rounded to one decimal.
        print("Time used", round(elapsed, 1))
        return result
    return timed_function

timeme(slow_math)

<function __main__.timeme.<locals>.timed_function(*args, **kwargs)>

slow_math_timed = timeme(slow_math)

slow_math_timed(4)

Time used 1.0

2.0

# decorator notation

@timeme  # equivalent to slow_math = timeme(slow_math): the name is rebound to the wrapper
def slow_math(x):
    """Return math.sqrt(x) after sleeping for one second."""
    time.sleep(1)
    return math.sqrt(x)

slow_math(2)

Time used 1.0

1.4142135623730951

def debug(function):
    """Decorator: return a wrapper that prints the arguments and the return value of each call."""
    def wrapped(*args, **kwargs):
        print(f"{function.__name__!s} called with {args!s} {kwargs!s}")
        result = function(*args, **kwargs)
        print(f"{function.__name__!s} returns {result!s}")
        return result
    return wrapped

@debug  # same as slow_math = debug(slow_math)
def slow_math(x):
    """Return math.sqrt(x) after sleeping for one second."""
    time.sleep(1)
    return math.sqrt(x)

slow_math(2)

slow_math called with (2,) {}
slow_math returns 1.4142135623730951

1.4142135623730951

# combination of decorators
# Decorators are applied bottom-up: slow_math = timeme(debug(slow_math))
@timeme  # outermost: times the whole debug-wrapped call
@debug  # innermost: wraps slow_math itself
def slow_math(x):
    """Return math.sqrt(x) after sleeping for one second."""
    time.sleep(1)
    return math.sqrt(x)

slow_math(2)

slow_math called with (2,) {}
slow_math returns 1.4142135623730951
Time used 1.0

1.4142135623730951

# combination of decorators
# Decorators are applied bottom-up: slow_math = debug(timeme(slow_math))
@debug  # outermost: receives (and reports the name of) the function returned by timeme
@timeme  # innermost: wraps slow_math itself
def slow_math(x):
    """Return math.sqrt(x) after sleeping for one second."""
    time.sleep(1)
    return math.sqrt(x)

slow_math(2)

timed_function called with (2,) {}
Time used 1.0
timed_function returns 1.4142135623730951

1.4142135623730951

# combination of decorators with wraps

from functools import wraps

def timeme(function):
    """Decorator that reports the running time of every call to ``function``.

    The wrapper forwards all positional and keyword arguments, prints
    ``Time used in <name> <seconds>`` (seconds rounded to one decimal) once
    the call has returned, and hands back the wrapped function's return
    value.  Nothing is printed if the call raises.
    """
    @wraps(function)  # copy function's __name__, __doc__, ... onto the wrapper
    def timed_function(*args, **kwargs):
        # perf_counter is a monotonic, high-resolution clock, so the measured
        # interval is not disturbed by adjustments of the system clock.
        t1 = time.perf_counter()
        return_value = function(*args, **kwargs)
        t2 = time.perf_counter()
        print("Time used in", function.__name__, round(t2 - t1, 1))
        return return_value
    return timed_function

def debug(function):
    """Decorator: print the arguments and the return value of every call to ``function``.

    ``wraps`` copies the metadata of ``function`` onto the wrapper.
    """
    @wraps(function)
    def wrapped(*args, **kwargs):
        print(f"{function.__name__!s} called with {args!s} {kwargs!s}")
        result = function(*args, **kwargs)
        print(f"{function.__name__!s} returns {result!s}")
        return result
    return wrapped

# combination of decorators with wraps

# Decorators are applied bottom-up: slow_math = timeme(debug(slow_math))
@timeme  # outermost: times the whole debug-wrapped call
@debug  # innermost: wraps slow_math itself
def slow_math(x):
    """Return math.sqrt(x) after sleeping for one second."""
    time.sleep(1)
    return math.sqrt(x)

slow_math(2)

slow_math called with (2,) {}
slow_math returns 1.4142135623730951
Time used in slow_math 1.0

1.4142135623730951

## Context managers

#t1 = time.time()
#slow_math(2)
#t2 = time.time()
#print("Time used in slow_math", round(t2-t1, 1))

#example: opening files
with open('sample.txt', 'w') as f:
    f.write('hello')

class TimeMe:
    """Context manager that prints the wall-clock time spent inside its block.

    Use as ``with TimeMe(): ...`` or ``with TimeMe() as timer: ...``; after
    the block, ``timer.t1`` and ``timer.t2`` hold the ``time.time()`` values
    taken on entry and on exit.
    """

    def __enter__(self):
        self.t1 = time.time()
        # Return the instance so that ``with TimeMe() as timer`` binds the
        # timer rather than None.
        return self

    def __exit__(self, *args):
        # args is (exc_type, exc_value, traceback).  Returning None lets an
        # exception raised inside the block propagate after the report.
        self.t2 = time.time()
        print("Time used in block", round(self.t2 - self.t1, 1))
        

with TimeMe():
    time.sleep(2)

Time used in block 2.0

Iterators#

li = ['hello', 'there', 'world']

# primitive iteration
for index in range(len(li)):
    print(index)
    print(li[index])
    

0
hello
1
there
2
world

# pythonic iteration
for member in li:
    print(member)

hello
there
world

for member in enumerate(li):
    print(member)

(0, 'hello')
(1, 'there')
(2, 'world')

for member in enumerate(li, start=1):
    print(member)

(1, 'hello')
(2, 'there')
(3, 'world')

#dictionaries
dictionary = {'a':1, 'b':2}
for k in dictionary:
    print(k, dictionary[k])

a 1
b 2

for k in dictionary.items():
    print(k)

('a', 1)
('b', 2)

# internally in the for loop

dir(dictionary)

['__class__',
 '__class_getitem__',
 '__contains__',
 '__delattr__',
 '__delitem__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__ior__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__ne__',
 '__new__',
 '__or__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__reversed__',
 '__ror__',
 '__setattr__',
 '__setitem__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 'clear',
 'copy',
 'fromkeys',
 'get',
 'items',
 'keys',
 'pop',
 'popitem',
 'setdefault',
 'update',
 'values']

dictionary.__iter__()

<dict_keyiterator at 0x7a37888ef650>

dir(dictionary.__iter__())

['__class__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__length_hint__',
 '__lt__',
 '__ne__',
 '__new__',
 '__next__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__']

dictionary.__iter__().__next__() # get the first value in the sequence

'a'

dictionary_iterator = dictionary.__iter__()  # same as iter(dictionary)

next(dictionary_iterator)

'a'

next(dictionary_iterator)

'b'

next(dictionary_iterator)

---------------------------------------------------------------------------
StopIteration                             Traceback (most recent call last)
Cell In[168], line 1
----> 1 next(dictionary_iterator)

StopIteration: 

# internally what happens during
for k in dictionary:
    print(k)

a
b

Generators#

def myfunction(n):
    """Return 0 regardless of ``n``: an ordinary function returns one value and is done."""
    first = 0
    return first

myfunction(3)

def myrange(n):
    """Generate 0, 1, 2, ... for as long as the value is smaller than ``n``."""
    current = 0
    while current < n:
        # Execution pauses here and resumes on the next request for a value.
        yield current
        current += 1

myrange(3)

<generator object myrange at 0x7a37888f9cc0>

for element in myrange(3):
    print(element)

0
1
2

# internally
myrange_iterator = myrange(3)

next(myrange_iterator)

next(myrange_iterator)

next(myrange_iterator)

next(myrange_iterator)

---------------------------------------------------------------------------
StopIteration                             Traceback (most recent call last)
Cell In[190], line 1
----> 1 next(myrange_iterator)

StopIteration: 

#Example: list the first n Fibonacci numbers:  0 1 1 2 3 5 8...

def fib(n):
    """Generate the first ``n`` Fibonacci numbers, starting 0, 1, 1, 2, 3, ..."""
    current, following = 0, 1
    produced = 0
    while produced < n:
        yield current
        # Slide the window one step: the pair (a, b) becomes (b, a + b).
        current, following = following, current + following
        produced += 1
    
    

for fibnum in fib(5):
    print(fibnum)

list(fib(20))

[0,
 1,
 1,
 2,
 3,
 5,
 8,
 13,
 21,
 34,
 55,
 89,
 144,
 233,
 377,
 610,
 987,
 1597,
 2584,
 4181]

# Example: search lines in a file for something

%%file data.txt
hello c++
hello fortran
hello python

Writing data.txt

!grep python data.txt

hello python

def grep(search_string, filename):
    """Yield every line of the file ``filename`` that contains ``search_string``.

    Newline characters are stripped from both ends of each yielded line.
    The file is read lazily, one line at a time.
    """
    with open(filename) as source:
        matches = (text for text in source if search_string in text)
        for text in matches:
            yield text.strip('\n')

for matching_line in grep('hello', 'data.txt'):
    print(matching_line)

hello c++
hello fortran
hello python

# Example: tail

!tail -f data.txt

hello c++
hello fortran
hello python
hello lisp
^C

def tail(filename):
    """Yield the lines of ``filename`` one by one, then keep waiting for new ones.

    The generator never finishes on its own: when no further line is
    available it sleeps for one second and tries again.
    """
    with open(filename) as stream:
        while True:
            line = stream.readline()
            if line:
                yield line
            else:
                # readline() returned '' (nothing new yet): wait, then poll again
                time.sleep(1)

for line in tail('data.txt'):
    print(line)

hello c++

hello fortran

hello python

hello lisp

hello rust

---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
Cell In[207], line 1
----> 1 for line in tail('data.txt'):
      2     print(line)

Cell In[206], line 6, in tail(filename)
      4 line = f.readline()
      5 if not line:
----> 6     time.sleep(1)
      7     continue
      8 yield line

KeyboardInterrupt: 

# a more general way
f = open('data.txt')

def grep(search_string, f):
    """Yield the items of the iterable ``f`` that contain ``search_string``.

    ``f`` can be an open file or any other iterable of strings, for example
    another generator.  Newline characters are stripped from both ends of
    each yielded line.
    """
    for line in filter(lambda candidate: search_string in candidate, f):
        yield line.strip('\n')

for matching_line in grep('hello', f):
    print(matching_line)

hello c++
hello fortran
hello python
hello lisp
hello rust

def tail(f):
    """Yield lines read with ``f.readline()``, then keep waiting for new ones.

    The generator never finishes on its own: whenever ``readline`` returns
    an empty result it sleeps for one second before asking again.
    """
    while True:
        line = f.readline()
        while not line:
            # nothing new yet: wait a second before polling again
            time.sleep(1)
            line = f.readline()
        yield line

f = open('data.txt')
for line in tail(f):
    print(line)

hello c++

hello fortran

hello python

hello lisp

hello rust

---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
Cell In[212], line 2
      1 f = open('data.txt')
----> 2 for line in tail(f):
      3     print(line)

Cell In[210], line 5, in tail(f)
      3 line = f.readline()
      4 if not line:
----> 5     time.sleep(1)
      6     continue
      7 yield line

KeyboardInterrupt: 

for line in grep('python', tail(open('data.txt'))):
    print(line)

hello python
python for all

---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
Cell In[215], line 1
----> 1 for line in grep('python', tail(open('data.txt'))):
      2     print(line)

Cell In[208], line 5, in grep(search_string, f)
      4 def grep(search_string, f):
----> 5     for line in f:
      6        if search_string in line:
      7              yield line.strip('\n')

Cell In[210], line 5, in tail(f)
      3 line = f.readline()
      4 if not line:
----> 5     time.sleep(1)
      6     continue
      7 yield line

KeyboardInterrupt: 

Exceptions#

1/0

---------------------------------------------------------------------------
ZeroDivisionError                         Traceback (most recent call last)
Cell In[2], line 1
----> 1 1/0

ZeroDivisionError: division by zero

try:
    1 / 0
except ZeroDivisionError:
    print("Warning: tried division by zero")

print("Continuing..")

Warning: tried division by zero
Continuing..

Notes

Contents

Notes#

Basics#

Numeric data types#

Strings#

Lists#

Tuples#

Slicing#

Dictionaries#

Repetition#

for-loops#

Branching#

if statements#

Functions#

Modules#

Files#

pathlib module#

csv module#

External libraries#

Matplotlib#

Pandas#

Dataframes#

Notes on the lab#

Interactive plots#

Reproducing gapminder presentation by Hans Rosling#

Update#

make numeric columns#

Classes#

Inheritance#

Testing#

doctest#

Version control#

configuration#

Branches#

Remote repositories#

update: plot savings history#

Advanced topics#

Decorators#

Iterators#

Generators#

Exceptions#