In [1]:
import sympy
import numpy as np
import matplotlib.pyplot as plt
import scipy.special as spsp
The Negative Binomial Distribution is another distribution built from i.i.d. Bernoulli trials. It models the probability that exactly $k$ trials are needed to obtain $r$ successes, where each Bernoulli trial succeeds with probability $p$.
$PMF(k)=\frac{(k-1)!}{(k-r)!(r-1)!}(1-p)^{k-r}p^r, k=r,r+1, \dots$
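A quick numerical cross-check of this formula (a sketch added here, not part of the original notebook) against scipy.stats.nbinom; note that SciPy's nbinom counts failures before the $r$-th success, so a total trial count $k$ corresponds to $k-r$ failures:
import numpy as np
import scipy.stats as spst
import scipy.special as spsp
r_chk, p_chk = 5, 0.3
k_chk = np.arange(5, 20)   # support starts at k = r
pmf_formula = spsp.comb(k_chk-1, k_chk-r_chk)*(1-p_chk)**(k_chk-r_chk)*p_chk**r_chk
pmf_scipy = spst.nbinom.pmf(k_chk-r_chk, r_chk, p_chk)
print(np.allclose(pmf_formula, pmf_scipy))   # expected: True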
Q1.
Use SymPy to compute the expected value of $k$.
In [2]:
sympy.init_printing()
k=sympy.Symbol('k', positive=True)
r=sympy.Symbol('r', positive=True)
p=sympy.Symbol('p', positive=True)
PMF_k=sympy.factorial(k-1)/sympy.factorial(k-r)/sympy.factorial(r-1)* (1-p)**(k-r) * p**r
E_k=sympy.Sum(k*PMF_k, (k, r, sympy.oo)).doit().simplify() # the support starts at k = r
#E_k.args[0][0] # .args lets you pull out one branch of the Piecewise directly; here we only want the r/p expression
E_k
Out[2]:
$\displaystyle \begin{cases} \frac{r}{p} & \text{for}\: \left|{p - 1}\right| < 1 \\ \frac{p^{r} \left(1 - p\right)^{- r} \sum_{k=r}^{\infty} \frac{\left(1 - p\right)^{k} \Gamma\left(k + 1\right)}{\Gamma\left(k - r + 1\right)}}{\Gamma\left(r\right)} & \text{otherwise} \end{cases}$
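The closed-form branch $E[k]=r/p$ can also be sanity-checked with a small Monte Carlo sketch (an addition, not from the original notebook): simulate Bernoulli($p$) trials until $r$ successes are reached and compare the average trial count to $r/p$.
import numpy as np
rng = np.random.default_rng(0)
def trials_until_r_successes(r_val, p_val):
    successes, trials = 0, 0
    while successes < r_val:
        trials += 1
        successes += rng.random() < p_val   # True counts as 1
    return trials
samples = [trials_until_r_successes(100, 0.4) for _ in range(2000)]
print(np.mean(samples), 100/0.4)   # both should be close to 250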
Q2.
Assume $r=100$ and $p=0.4$. If we want to perform an upward/downward search, what is the best starting point?
In [3]:
p_val=0.4
r_val=100
k_vals=np.arange(r_val,300) # use new names so the SymPy symbols k, r, p are not overwritten
# the raw factorials overflow for k this large; uncomment the next line and plot it to see the overflow
#PMF=spsp.factorial(k_vals-1)/spsp.factorial(k_vals-r_val)/spsp.factorial(r_val-1)*(1-p_val)**(k_vals-r_val)*p_val**r_val
PMF=spsp.comb(k_vals-1,k_vals-r_val)*(1-p_val)**(k_vals-r_val)*p_val**r_val # the binomial coefficient stays far smaller than the factorials, so it does not overflow
plt.bar(k_vals,PMF)
Out[3]:
[bar plot of the negative binomial PMF over k = 100, ..., 299]
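An alternative sketch (not in the original notebook) that sidesteps the overflow issue entirely: compute the PMF in log space with scipy.special.gammaln and exponentiate at the end, reusing p_val, r_val, and k_vals from the cell above (and the imports from In [1]).
log_pmf = (spsp.gammaln(k_vals) - spsp.gammaln(k_vals - r_val + 1) - spsp.gammaln(r_val)
           + (k_vals - r_val)*np.log(1 - p_val) + r_val*np.log(p_val))
PMF_log_space = np.exp(log_pmf)   # should match the spsp.comb version above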
In [4]:
k0=sympy.Symbol('k0') # k0 will be the starting value of the search
# downward search: terms for samples with k <= k0
Part1=sympy.Sum( (k0+2-k)*PMF_k ,(k, r, k0)).doit()
# upward search: terms for samples with k > k0
Part2=sympy.Sum( (k-k0+1)*PMF_k ,(k, k0+1, sympy.oo)).doit()
Es=Part1+Part2
Es
In [5]:
# Evaluate the expected search cost for a range of candidate starting points k0;
# the k0 that gives the smallest value is the best place to start the search
k0_array=np.arange(101,300)
Es_array=np.array([sympy.N(Es.subs({r:100,p:0.4,k0:k0_v})) for k0_v in k0_array])
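Because the symbolic sums can be slow to evaluate, here is a purely numeric cross-check sketch (an addition, not from the original): truncate the infinite upward sum at a large k where the PMF is negligible and evaluate the same expected-cost expression with NumPy.
ks = np.arange(100, 601)   # the PMF is negligible well before k = 600
pmf_vals = spsp.comb(ks-1, ks-100)*(1-0.4)**(ks-100)*0.4**100
def es_numeric(k0_v):
    down = ks <= k0_v
    return np.sum((k0_v + 2 - ks[down])*pmf_vals[down]) + np.sum((ks[~down] - k0_v + 1)*pmf_vals[~down])
es_num = np.array([es_numeric(k0_v) for k0_v in k0_array])
print(k0_array[np.argmin(es_num)])   # should agree with the SymPy result in the next cell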
In [6]:
# best starting value
k0_array[np.argmin(Es_array)]
Out[6]:
249
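As a rough sanity check on this result (an aside, not from the original notebook): the cost expression charges one comparison per search step, so its minimizer should sit near the median of the distribution. SciPy's nbinom counts failures before the r-th success, so adding r converts its median to a total trial count.
import scipy.stats as spst
print(spst.nbinom.median(100, 0.4) + 100)   # should land close to 249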
Q3.
Implement the algorithm using the best starting point.
In [7]:
(PMF_k/PMF_k.subs({k:k-1})).simplify() # ratio PMF(k)/PMF(k-1), used to update the pmf recursively during the search
Out[7]:
$\displaystyle \frac{\left(k - 1\right) \left(p - 1\right)}{- k + r}$
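Before building the sampler, a small sketch (my addition) confirming that this ratio reproduces the PMF when applied recursively, using scipy.stats.nbinom as a reference:
import scipy.stats as spst
p_val, r_val = 0.4, 100
pmf_rec = p_val**r_val   # PMF at k = r
for k_val in range(r_val + 1, r_val + 6):
    pmf_rec *= (k_val - 1)*(p_val - 1)/(r_val - k_val)   # the ratio derived above
    print(k_val, pmf_rec, spst.nbinom.pmf(k_val - r_val, r_val, p_val))   # the two values should match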
In [8]:
# x plays the role of k below
def start():
    # precompute the pmf and cdf at the starting point once, outside the sampler, for efficiency
    start_x=249
    p=0.4
    r=100
    k=r # we have to accumulate the cdf from k = r; starting directly at 249 would only give the pmf at 249
    pmf=spsp.comb(k-1,k-r)*(1-p)**(k-r)*p**r
    cdf=pmf
    for k in range(r+1, start_x+1):
        ratio=(k-1)*(p-1)/(r-k) # the ratio comes from (PMF_k/PMF_k.subs({k:k-1})).simplify()
        pmf=pmf*ratio
        cdf=cdf+pmf
    return cdf,pmf

def nb_sampling(cdf_start, pmf_start):
    # starting value of the search
    x=249
    p=0.4
    r=100
    # cdf and pmf corresponding to the starting value
    cdf=cdf_start
    pmf=pmf_start
    # compare u with the cdf
    u=np.random.rand()
    # if u >= cdf: search upward
    if u>=cdf:
        x=x+1
        ratio=(x-1)*(p-1)/(r-x)
        pmf=pmf*ratio
        cdf=cdf+pmf
        while u>=cdf:
            x=x+1
            ratio=(x-1)*(p-1)/(r-x)
            pmf=pmf*ratio
            cdf=cdf+pmf
        return x
    # else: search downward
    else:
        cdf=cdf-pmf   # cdf is now the cdf at x-1
        while u<cdf:  # keep stepping down while u still lies below the cdf at x-1
            ratio=(x-1)*(p-1)/(r-x)
            pmf=pmf/ratio
            x=x-1
            cdf=cdf-pmf
        return x
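A short usage sketch for the two functions above (assuming the completed downward branch): precompute the starting cdf/pmf once, draw many samples, and compare the sample mean against the theoretical mean $r/p = 100/0.4 = 250$.
cdf0, pmf0 = start()
draws = np.array([nb_sampling(cdf0, pmf0) for _ in range(5000)])
print(draws.mean())   # should be close to 250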